taxa_list <- c("Quercus", "Cupressaceae", "Ambrosia", "Morus", "Pinaceae", "Ulmus early", "Ulmus late", "Fraxinus", "Betula", "Poaceae early", "Poaceae late", "Acer", "Populus")
taxa_short_list <- str_split(taxa_list, pattern = " ", simplify = T) [, 1]
site_list <- c("NY", "SJ", "AT", "ST", "HT", "TP", "DT", "DV", "KC", "SL")
sitename_list <- c("New York", "San Jose", "Austin", "Seattle", "Houston", "Tampa", "Detroit", "Denver", "Kansas City", "St. Louis")
year_list <- 2018:2021

1 Data

1.1 NAB data

1.1.1 Read and clean data

nab_path <- "/data/ZHULAB/phenology/nab/2021-10-04/"

# read in station coordinates
station_df <- read_csv("/data/ZHULAB/phenology/nab/NAB stations.csv") %>% # this csv is manually typed
  mutate(id = row_number())

file_list <- list.files(path = nab_path, pattern = ".xlsx", full.names = T)
nab_df_list <- vector(mode = "list", length = length(file_list))
for (i in 1:length(file_list)) {
  file <- file_list[i]

  # read in all data
  dat <- read_excel(
    file,
    col_names = F
  )
  start_n <- which(dat[, 1] == "Date") # excel files have headings of different number of rows. Search for the row(s) that starts with "Date" as the start of data table(s).

  # read in meta data
  if (min(start_n) == 1) { # Some excel files have no meta data before data table
    meta_dat <- NA
    station <- NA
  } else { # If there are meta data before data table, read it as a vector
    meta_dat <- read_excel(
      file,
      col_names = F,
      n_max = start_n[1] - 1
    ) %>% pull()
    station <- meta_dat[2] %>%
      strsplit(split = ": ") %>%
      unlist() %>%
      tail(1) # extract station name from meta data vector
  }

  location <- file %>%
    strsplit(split = c("/")) %>%
    unlist() %>%
    tail(1) %>%
    strsplit(split = " \\(") %>%
    unlist() %>%
    head(1) # city name extracted from file name

  # get coordinates of station
  if (!is.na(station)) { # use both city and station name
    station_info <- data.frame(station = station, location = location) %>%
      stringdist_inner_join(station_df, by = c("location", "station"), max_dist = 20, distance_col = "distance") %>% # join with all entries in station info, measuring dissimilarity in station and location, because their names might be slightly different in both tables
      arrange(station.distance, location.distance) %>%
      head(1) %>% # there might be several close matches. take the closest.
      rename(station = station.y) %>%
      rename(location = location.y) %>%
      dplyr::select(-station.x, -location.x, -station.distance, -location.distance, -distance)
  } else { # use only city name if station name is not available
    station_info <- data.frame(location = location) %>%
      stringdist_inner_join(station_df, by = "location", max_dist = 20, distance_col = "distance") %>%
      arrange(distance) %>%
      head(1) %>%
      rename(location = location.y) %>%
      dplyr::select(-location.x, -distance)
  }

  # Extract pollen and spore data table
  if (length(start_n) == 1) { # only pollen, no spores
    colnum <- ncol(read_excel(
      file,
      skip = start_n[1] - 1
    )) # find number of columns

    pollen_dat <- read_excel(
      file,
      skip = start_n[1] - 1,
      col_types = c(
        "date",
        rep("numeric", colnum - 1)
      )
    )
    if (is.na(pollen_dat[2, 1])) { # meaning finer taxonomic resolutions available
      genus_names <- read_excel(
        file,
        skip = start_n[1] - 1
      ) %>%
        slice(1) %>% # read the first row with finer taxonomic resolution
        gather(key = "old", value = "new") %>% # coarse and fine taxonomy into long format
        mutate(new = case_when(
          is.na(new) ~ old,
          TRUE ~ new
        )) # sometimes only coarse taxonomy is present, then keep that

      pollen_dat <- read_excel(
        file,
        skip = start_n[1] - 1,
        col_types = c(
          "date",
          rep("numeric", colnum - 1)
        ),
        col_names = genus_names$new # change to new colnames with finer taxonomic resolution
      )
    }
    pollen_dat <- pollen_dat %>%
      filter(!is.na(Date)) %>% # filter out rows that are not count data
      gather(key = "taxa", value = "count", -Date) %>% # to long format
      filter(!str_detect(taxa, "Station")) %>% # filter out additional info that are not count data, like "Station Name", "Station Postal Code", "Station State"
      filter(!taxa %in% c("Comment", "WeatherNotes")) %>%
      mutate(group = "pollen") %>%
      rowwise() %>%
      group_by(Date, taxa, group) %>%
      summarize(count = sum(count)) %>%
      ungroup()

    # combine with station info
    nab_df_list[[i]] <- pollen_dat %>%
      cbind(station_info)
  }

  if (length(start_n) == 2) { # both pollen and spores
    colnum_pollen <- ncol(read_excel(
      file,
      skip = start_n[1] - 1,
      n_max = (start_n[2] - 2) - (start_n[1] + 1)
    ))

    pollen_dat <- read_excel(
      file,
      skip = start_n[1] - 1,
      n_max = (start_n[2] - 2) - (start_n[1] + 1),
      col_types = c(
        "date",
        rep("numeric", colnum_pollen - 1)
      )
    )

    if (is.na(pollen_dat[2, 1])) { # meaning finer taxonomic resolutions available
      genus_names <- read_excel(
        file,
        skip = start_n[1] - 1,
        n_max = (start_n[2] - 2) - (start_n[1] + 1)
      ) %>%
        slice(1) %>%
        gather(key = "old", value = "new") %>%
        mutate(new = case_when(
          is.na(new) ~ old,
          TRUE ~ new
        ))

      pollen_dat <- read_excel(
        file,
        skip = start_n[1] - 1,
        n_max = (start_n[2] - 2) - (start_n[1] + 1),
        col_types = c(
          "date",
          rep("numeric", colnum_pollen - 1)
        ),
        col_names = genus_names$new
      )
    }
    pollen_dat <- pollen_dat %>%
      filter(!is.na(Date)) %>%
      gather(key = "taxa", value = "count", -Date) %>%
      filter(!str_detect(taxa, "Station")) %>%
      filter(!taxa %in% c("Comment", "WeatherNotes")) %>%
      mutate(group = "pollen") %>%
      rowwise() %>%
      group_by(Date, taxa, group) %>%
      summarize(count = sum(count)) %>%
      ungroup()

    colname_spore <- ncol(read_excel(
      file,
      skip = start_n[2] - 1
    ))
    spore_dat <- read_excel(
      file,
      skip = start_n[2] - 1,
      col_types = c(
        "date",
        rep("numeric", colname_spore - 1)
      )
    ) %>%
      filter(!is.na(Date)) %>%
      gather(key = "taxa", value = "count", -Date) %>%
      filter(!str_detect(taxa, "Station")) %>%
      filter(!taxa %in% c("Comment", "WeatherNotes")) %>%
      mutate(group = "spore")

    if (nrow(spore_dat) != 0) {
      nab_df_list[[i]] <- bind_rows(pollen_dat, spore_dat) %>%
        cbind(station_info)
    } else {
      nab_df_list[[i]] <- pollen_dat %>%
        cbind(station_info)
    }
  }
  print(i)
}

nab_df <- bind_rows(nab_df_list)

write_rds(nab_df, file = "/data/ZHULAB/phenology/nab/nab_dat.rds")

1.1.2 Resolve taxonomy

nab_df <- read_rds("/data/ZHULAB/phenology/nab/nab_dat.rds")

## get all distinct taxa and clean up their names
nab_taxa_df <- nab_df %>%
  distinct(taxa) %>%
  filter(!taxa %in% c("Total Pollen Count", "Total Spore Count")) %>%
  rename(taxa_raw = taxa) %>%
  rowwise() %>%
  mutate(taxa_clean = str_split(taxa_raw, pattern = "/", simplify = T)[1]) %>% # some taxa have alternative names, e.g., Chenopodiaceae/Amaranthaceae
  mutate(taxa_clean = str_split(taxa_clean, pattern = " \\(", simplify = T)[1]) %>% # e.g., Asteraceae (Excluding Ambrosia and Artemisia)
  mutate(taxa_clean = str_replace(taxa_clean, pattern = "-type", "")) %>% # e.g., Leptosphaeria-type
  mutate(taxa_clean = str_replace(taxa_clean, pattern = "Unidentified ", "")) %>% # e.g., Unidentified Fungi
  mutate(taxa_clean = str_replace(taxa_clean, pattern = "Undifferentiated ", "")) %>% # e.g., Undifferentiated Ascospores
  mutate(taxa_clean = str_replace(taxa_clean, pattern = "Other ", "")) %>% # e.g., Other Grass Pollen
  mutate(taxa_clean = str_replace(taxa_clean, pattern = "spores", "mycota")) %>% # e.g., Undifferentiated Ascospores
  mutate(taxa_clean = str_replace(taxa_clean, pattern = " ", "")) %>% # remove space
  mutate(taxa_clean = case_when(
    taxa_clean == "Rusts" ~ "Pucciniales",
    taxa_clean == "Smuts" ~ "Myxomycetes",
    taxa_clean == "Dreshslera" ~ "Helminthosporium",
    (taxa_clean == "GrassPollen" | taxa_clean == "WeedPollen") ~ "Gramineae",
    taxa_clean == "TreePollen" ~ "Tracheophyta",
    taxa_clean == "Pollen" ~ "Viridiplantae",
    TRUE ~ taxa_clean
  )) %>% # manual cleaning
  arrange(taxa_clean)

## Use taxize to resolve taxonomy
# Find sources id
# gnr_datasources()

# match with names in databases
resolve_df <- nab_taxa_df %>%
  pull(taxa_clean) %>%
  gnr_resolve(data_source_ids = c(4, 11), with_context = T, best_match_only = T, fields = "all") %>% # NCBI and GBIF databases
  full_join(nab_taxa_df,
    by = c("user_supplied_name" = "taxa_clean")
  ) %>%
  rename(taxa_clean = user_supplied_name) %>%
  mutate(same = (taxa_clean == matched_name)) # check if all taxa names are valid

# some taxa are incorrectly identified as Metazoa. Resolve again for those.
resolve_df_correct <- resolve_df %>%
  filter(str_detect(classification_path, "Metazoa")) %>%
  pull(taxa_clean) %>%
  gnr_resolve(data_source_ids = c(4, 11), best_match_only = F, fields = "all") %>% # NCBI and GBIF. Keep all matches here, not only the best match.
  filter(!str_detect(classification_path, "Animalia")) %>%
  filter(!str_detect(classification_path, "Metazoa")) %>%
  rename(taxa_clean = user_supplied_name) %>%
  group_by(taxa_clean) %>%
  slice(1) %>%
  ungroup()

# Correct previously resolved taxonomy
resolve_df <- resolve_df %>%
  filter(!str_detect(classification_path, "Metazoa")) %>%
  bind_rows(resolve_df_correct) %>%
  arrange(taxa_clean)

# make sure all taxa are resolved
# resolve_df %>% filter(!same) %>% View()

# get full classification
taxa_id_df <- resolve_df %>%
  dplyr::select(taxa_clean, data_source_id, taxon_id) %>%
  distinct(taxa_clean, .keep_all = T) %>%
  mutate(data_source = case_when(
    data_source_id == 4 ~ "ncbi",
    data_source_id == 11 ~ "gbif"
  ))

taxa_class_df <- vector(mode = "list", length = nrow(taxa_id_df))
for (i in 1:nrow(taxa_id_df)) {
  taxa_class_df[[i]] <-
    classification(taxa_id_df$taxon_id[i], db = taxa_id_df$data_source[i])  [[1]] %>%
    as_tibble() %>%
    filter(rank %in% c("kingdom", "phylum", "class", "order", "family", "genus", "species")) %>%
    mutate(rank = factor(rank, levels = c("kingdom", "phylum", "class", "order", "family", "genus", "species"))) %>%
    dplyr::select(-id) %>%
    spread(key = "rank", value = "name") %>%
    mutate(taxa_clean = taxa_id_df$taxa_clean[i])
}
taxa_class_df <- bind_rows(taxa_class_df)

nab_taxa_df <- nab_taxa_df %>%
  left_join(taxa_class_df, by = "taxa_clean") %>%
  mutate(kingdom = str_replace(kingdom, "Plantae", "Viridiplantae")) %>% # manual correction
  mutate(kingdom = case_when(
    phylum == "Oomycota" ~ "Chromista",
    TRUE ~ kingdom
  ))

write_rds(nab_taxa_df, "/data/ZHULAB/phenology/nab/nab_taxa.rds")

Combine nab data and taxa information. Some preprocessing.

nab_df <- read_rds("/data/ZHULAB/phenology/nab/nab_dat.rds")
nab_taxa_df <- read_rds("/data/ZHULAB/phenology/nab/nab_taxa.rds")

nab_with_taxa_df <- nab_df %>%
  rename(taxa_raw = taxa) %>%
  left_join(nab_taxa_df, by = "taxa_raw") %>%
  rename(taxa = taxa_clean) %>%
  mutate(family = case_when(
    taxa_raw == "Total Pollen Count" ~ "Total",
    TRUE ~ family
  )) %>%
  mutate(genus = case_when(
    taxa_raw == "Total Pollen Count" ~ "Total",
    TRUE ~ genus
  )) %>%
  filter(kingdom == "Viridiplantae" | is.na(kingdom)) %>%
  group_by(Date, lat, lon, station,location, id, family, genus, taxa) %>%
  summarise(count = sum(count)) %>%
  ungroup() %>%
  mutate(date = as.Date(Date)) %>%
  dplyr::select(-Date)

Focus on ten cities. * Exceptions: Denver pollen data are from Colorado Springs; Austin pollen data are from Georgetown; Detroit pollen data are from Sylvania.

# summarize station info
meta_df <- nab_with_taxa_df %>%
  filter(taxa %in% taxa_short_list) %>% # limit to taxa studied
  drop_na(count) %>%
  group_by(station, location, lat, lon, id) %>%
  summarise(
    mindate = min(date),
    maxdate = max(date),
    n = n()
  ) %>%
  mutate(range = maxdate - mindate) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  mutate(site = NA)

# match NAB stations with cities studied
meta_df$site[meta_df$id == 26] <- "NY"
meta_df$site[meta_df$id == 7] <- "SJ"
meta_df$site[meta_df$id == 5] <- "AT"
meta_df$site[meta_df$id == 33] <- "ST"
meta_df$site[meta_df$id == 2] <- "HT"
meta_df$site[meta_df$id == 48] <- "TP"
meta_df$site[meta_df$id == 12] <- "DV"
meta_df$site[meta_df$id == 47] <- "DT"
meta_df$site[meta_df$id == 17] <- "KC"
meta_df$site[meta_df$id == 37] <- "SL"
meta_df <- meta_df %>%
  left_join(data.frame(site = site_list, sitename = sitename_list), by = "site")

# make map
p_pollen_map <- ggplot() +
  geom_polygon(data = map_data("state"), aes(x = long, y = lat, group = group), fill = "white") +
  geom_path(data = map_data("state"), aes(x = long, y = lat, group = group), color = "grey50", alpha = 0.5, lwd = 0.2) +
  theme_void() +
  geom_point(data = meta_df, aes(x = lon, y = lat), pch = 10, color = "black", cex = 3) +
  geom_label_repel(data = meta_df %>% filter(site %in% site_list), aes(x = lon, y = lat, label = sitename)) +
  geom_point(data = meta_df %>% filter(site %in% site_list), aes(x = lon, y = lat), pch = 10, color = "red", cex = 3) +
  coord_equal()
p_pollen_map

View pollen phenology in study sites.

nab_with_taxa_df %>%
  left_join(meta_df %>% dplyr::select(id, site, sitename), by = "id") %>%
  filter(site %in% site_list) %>%
  filter(taxa %in% taxa_short_list) %>%
  mutate(doy = format(date, "%j") %>% as.integer()) %>%
  mutate(year = format(date, "%Y") %>% as.integer()) %>%
  # filter(year %in% year_list) %>% 
  group_by(taxa, site) %>% 
  mutate(count=count %>% sqrt()) %>% 
  mutate(count_st=(count-min(count, na.rm = T))/(max(count, na.rm = T)-min(count, na.rm = T))) %>%
  ggplot() +
  geom_point(aes(x = doy, y = count_st, group = taxa, col = taxa), alpha=0.3) +
  facet_grid(cols = vars(taxa), rows = vars(sitename), scales = "free_y") +
  theme_classic()
## Warning in sqrt(.): NaNs produced
## Warning: Removed 7494 rows containing missing values (geom_point).

1.2 Plant location data

1.2.1 Tree inventory

Read in and clean data. Each tree inventory has a different format, so they have to be processed differently.

cl <- makeCluster(length(site_list), outfile = "")
registerDoSNOW(cl)

trees_df_list <- foreach(
  site = site_list,
  .packages = c("tidyverse", "rgdal")
) %dopar% {
  if (site == "KC" | site == "SL") {
    # Data from urban FIA
    # Download https://experience.arcgis.com/experience/3641cea45d614ab88791aef54f3a1849/page/Urban-Datamart/
    # Manual: https://www.fia.fs.fed.us/library/database-documentation/urban/dbDescription/Urban_FIADB_User_Guides_Database_Description_ver3-0_2021_03_17.pdf

    # tree table
    id_tree_df<-read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/UrbanFIA/ID_TREE.csv") %>%
    filter(statecd == 29) %>% # Missouri
      filter(countycd %in% case_when(site == "KC" ~ 95, site == "SL" ~ c(189, 510))) %>%
      dplyr::select(plotid, id = cn, spcd, statuscd) %>%
      group_by(id) %>%
      filter(sum(statuscd != 1) == 0) %>% # living trees
      ungroup() %>%
      distinct(id, .keep_all = T) %>% # in case one tree is surveyed repeatedly
      dplyr::select(-statuscd)

    # plot table
    id_plot_df<-read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/UrbanFIA/ID_PLOT.csv") %>%
    dplyr::select(plotid, lat, lon)

    # species reference table
    ref_sp_df<-read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/UrbanFIA/REF_SPECIES.csv") %>%
    dplyr::select(spcd, genus, species) %>%
      mutate(species = paste(genus, species))

    # join tables
    trees_df <- id_tree_df %>%
      left_join(id_plot_df, by = "plotid") %>%
      left_join(ref_sp_df, by = "spcd") %>%
      dplyr::select(-plotid, -spcd) %>%
      mutate(site = site)
  }
  if (site == "DT") {
    # Data from Dan Katz
    trees_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Detroit_oak_pheno_obs_spring_2017.csv") %>%
      distinct(id = tree, species = Species, x, y)

    # reproject to WGS84
    pts <- SpatialPoints(trees_df[, c("x", "y")],
      proj4string = CRS("+init=EPSG:3857")
    )
    # +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext +no_defs
    pts_reproj <- spTransform(
      pts,
      CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
    )

    trees_df <- cbind(trees_df %>% dplyr::select(-x, -y), coordinates(pts_reproj)) %>%
      rename(lat = y, lon = x) %>%
      as_tibble() %>%
      mutate(site = site)
  }
  if (site == "DV") {
    # Data from OpenTrees.org 
    trees_df<-read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_Denver.csv") %>%
    dplyr::select(id = SITE_ID, species = SPECIES_BO, time = INVENTORY_DATE, lat = Y_LAT, lon = X_LONG) %>%
      arrange(desc(time)) %>%
      distinct(id, species, .keep_all = T) %>%
      dplyr::select(-time) %>%
      filter(lon != 0) %>%
      mutate(site = site)
  }
  if (site == "TP") {
    # Data from https://www.opentreemap.org/tampa/map/
    trees_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_Tampa.csv") %>%
      dplyr::select(id = `Tree Id`, genus = Genus, species = Species, lat = `Point Y`, lon = `Point X`) %>% # checked that there is no repeated tree id
      mutate(species = paste0(genus, " ", species)) %>%
      dplyr::select(-genus) %>%
      mutate(site = site)
  }
  if (site == "HT") {
    # Data from  https://koordinates.com/layer/25245-houston-texas-street-tree-inventory/data/
    trees_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_Houston/houston-texas-street-tree-inventory.csv") %>%
      dplyr::select(species_common = Species, X = Shape_X, Y = Shape_Y) %>%
      rowwise() %>%
      mutate(common = case_when(
        str_detect(species_common, ", spp.") ~ str_replace(species_common, ", spp.", ""), # for coarse common names, e.g., "Oak, spp."
        (str_detect(species_common, ", ") & !str_detect(species_common, ", spp.")) ~ paste0(str_split(species_common, ", ")[[1]][2], " ", str_split(species_common, ", ")[[1]][1]), # reverse order of common name, e.g., "Oak, Water"
        TRUE ~ species_common
      )) %>%
      ungroup() %>%
      distinct(X, Y, .keep_all = T) %>% # in case same tree is sampled repeatedly
      mutate(id = row_number())

    # reproject to WGS84
    pts <- SpatialPoints(trees_df[, c("X", "Y")],
      proj4string = CRS("+proj=merc +a=6378137 +b=6378137 +lat_ts=0 +lon_0=0 +x_0=0 +y_0=0 +k=1 +units=m +nadgrids=@null +wktext +no_defs")
    )
    pts_reproj <- spTransform(
      pts,
      CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
    )

    trees_df <- cbind(trees_df %>% dplyr::select(-X, -Y), coordinates(pts_reproj)) %>%
      rename(lat = Y, lon = X)

    # Use taxize to match common name with scientific name
    if (!file.exists("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_Houston/species_reference.csv")) {
      id2comm <- function(pageid) {
        common_matches <- eol_pages(taxonconceptID = pageid, common_names = TRUE)$vernacular
        if (is.null(common_matches)) {
          common_match <- ""
        } else {
          common_matches <- common_matches %>% filter(language == "en")
          if (nrow(common_matches) > 0) {
            common_match <- common_matches %>%
              pull(vernacularname) %>%
              table() %>%
              sort(decreasing = TRUE) %>%
              head(1) %>%
              names()
          }
        }
        return(common_match)
      }

      comm2sci_new <- function(common) {
        species <- comm2sci(common, simplify = T)[[1]]
        if (length(species) == 0) {
          res <- comm2sci(common, db = "eol", simplify = F)[[1]]
          if (length(res) == 0) {
            species <- NA
          } else {
            species <- res %>%
              as_tibble() %>%
              rowwise() %>%
              mutate(common_match = id2comm(pageid)) %>%
              ungroup() %>%
              stringdist_inner_join(data.frame(common_name = common), by = c("common_match" = "common_name"), max_dist = 20, distance_col = "distance") %>%
              arrange(distance, pageid) %>%
              head(1) %>%
              filter(common_match != "") %>%
              pull(name)
            if (length(species) == 0) {
              species <- NA
            }
          }
          print(paste(common, species))
        }

        if (is.na(species)) {
          species <- readline(prompt = paste0(species_df$common[i], ". Manually enter scientific name: "))
        }

        return(species)
      }

      species_df <- trees_df %>%
        distinct(common) %>%
        rowwise() %>%
        mutate(species = comm2sci_new(common))
      write_csv(species_df, "/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_Houston/species_reference.csv")
    } else {
      species_df <- read_csv("/data/ZHULAB/phenology/StreetTrees/Tree_Inventory_Houston/species_reference.csv")
    }

    trees_df <- trees_df %>%
      left_join(species_df, by = "common") %>%
      dplyr::select(-species_common, -common) %>%
      mutate(site = site)
  }
  if (site == "NY") {
    trees_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_NewYork.csv") %>%
      dplyr::select(id = tree_id, species = spc_latin, lat = latitude, lon = longitude) %>% # checked that there is no repeated tree id
      mutate(site = site)
  }
  if (site == "AT") {
    trees_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_Austin/Tree_Inventory_Austin.csv") %>%
      dplyr::select(id = OBJECTID, species = SPECIES, coordinates = the_geom) %>% # checked that there is no repeated tree id
      mutate(coordinates = str_replace(coordinates, "POINT \\(", "")) %>%
      mutate(coordinates = str_replace(coordinates, "\\)", "")) %>%
      rowwise() %>%
      mutate(
        lon = str_split(coordinates, pattern = " ", simplify = T)[1],
        lat = str_split(coordinates, pattern = " ", simplify = T)[2]
      ) %>%
      ungroup() %>%
      mutate(
        lon = as.numeric(lon),
        lat = as.numeric(lat)
      ) %>%
      dplyr::select(-coordinates) %>%
      mutate(site = site)
  }
  if (site == "SJ") {
    trees_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_SanJose/Tree_Inventory_SanJose.csv") %>%
      dplyr::select(id = OBJECTID, species = NAMESCIENTIFIC, Y = Y, X = X) # checked that there is no repeated tree id

    # shape <- readOGR(dsn = "/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_SanJose/Street_Tree.shp")
    # proj4string(shape)
    projection_sj <- "+proj=lcc +lat_0=36.5 +lon_0=-120.5 +lat_1=38.4333333333333 +lat_2=37.0666666666667 +x_0=2000000.0001016 +y_0=500000.0001016 +datum=NAD83 +units=us-ft +no_defs"
    pts <- SpatialPoints(trees_df[, c("X", "Y")],
      proj4string = CRS(projection_sj)
    )
    pts_reproj <- spTransform(
      pts,
      CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
    )

    trees_df <- cbind(trees_df %>% dplyr::select(-X, -Y), coordinates(pts_reproj)) %>%
      rename(lat = Y, lon = X) %>%
      mutate(site = site)
  }

  if (site == "ST") {
    trees_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Tree_Inventory_Seattle.csv") %>%
      dplyr::select(id = OBJECTID, species = SCIENTIFIC_NAME, lat = Y, lon = X) %>% # checked that there is no repeated tree id
      mutate(site = site)
  }
  trees_df
}

stopCluster(cl)

trees_df <- bind_rows(trees_df_list) %>%
  rowwise() %>%
  mutate(genus = str_split(species, pattern = " ", simplify = T)[1]) %>% # get genus name from species name
  ungroup() %>%
  distinct(id, site, .keep_all = T) %>% # in case one tree is surveyed repeatedly
  mutate(genus_id = as.integer(as.factor(genus)))

Find family names from genus names. This step needs supervision.

genus_to_family<-trees_df %>%
  distinct(genus) %>% 
  drop_na() %>% 
  mutate(family=tax_name(sci = genus, get = "family", messages = F, db="ncbi")$family)
write_rds(genus_to_family, "/data/ZHULAB/phenology/occurrence/genus_to_family.rds")

Join with inventory data and save.

genus_to_family<-read_rds("/data/ZHULAB/phenology/occurrence/genus_to_family.rds")

trees_df<-trees_df %>% 
  left_join(genus_to_family, by="genus")
write_rds(trees_df, "/data/ZHULAB/phenology/occurrence/street_trees.rds")

1.2.2 Grass patch

# Land cover data manually downloaded from Sentinel-2 Land Use/ Land Cover Downloader
# https://www.arcgis.com/apps/instant/media/index.html?appid=fc92d38533d440078f17678ebc20e8e2
grass_path <- "/data/ZHULAB/phenology/occurrence/LULC/"
trees_df <- read_rds("/data/ZHULAB/phenology/occurrence/street_trees.rds")

cl <- makeCluster(length(site_list), outfile = "")
registerDoSNOW(cl)

grass_df_city_list <- foreach(
  siteoi = site_list,
  .packages = c("tidyverse", "raster")
) %dopar% {

  # get extent from tree inventory
  trees_df_city <- trees_df %>%
    filter(site == siteoi) %>%
    drop_na(lon, lat)
  bbox <- extent(min(trees_df_city$lon), max(trees_df_city$lon), min(trees_df_city$lat), max(trees_df_city$lat))

  grass_df_city_year_list <- vector(mode = "list")
  for (yearoi in year_list) {
    # get file name(s) for each site and year
    files <- list.files(paste0(grass_path, siteoi), pattern = paste0(yearoi, "0101-"), full.names = T)
    
    # read raster(s)
    ras <- raster(files)
    
    # crop to city
    bbox_sp <- as(bbox, "SpatialPolygons")
    projection(bbox_sp) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"
    bbox_reproj <- spTransform(bbox_sp, proj4string(ras))
    ras_cr <- crop(ras, bbox_reproj)

    # get coordinates of grass
    grass_df_year <- as.data.frame(ras_cr, xy = T) %>%
      `colnames<-`(c("x", "y", "class")) %>%
      filter(class == 11) %>%
      dplyr::select(-class) %>%
      mutate(year = yearoi)
    grass_df_city_year_list[[yearoi %>% as.character()]] <- grass_df_year
    print(paste0(siteoi, yearoi))
  }
  grass_df_city <- bind_rows(grass_df_city_year_list) %>%
    mutate(grass = 1) %>% # choose pixels that are grass in all years
    spread(key = "year", value = "grass") %>%
    drop_na() %>%
    dplyr::select(x, y) %>%
    sample_n(min(10000, nrow(.))) # subset when there are too many grass pixels

  # reproject
  grass_sp_reproj <- SpatialPoints(grass_df_city[, c("x", "y")],
    proj4string = CRS(proj4string(ras))
  )
  grass_sp <- spTransform(
    grass_sp_reproj,
    CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
  )

  # get coordinates
  grass_df_city <- coordinates(grass_sp) %>%
    as_tibble() %>%
    `colnames<-`(c("lon", "lat")) %>%
    mutate(site = siteoi) %>%
    mutate(id = row_number()) %>%
    mutate(family = "Poaceae") %>%
    mutate(genus = "Unknown") %>%
    mutate(genus_id = 999)
  grass_df_city
}
stopCluster(cl)

grass_df <- bind_rows(grass_df_city_list)
write_rds(grass_df, "/data/ZHULAB/phenology/occurrence/grass.rds")

1.2.3 Ragweed observation

Get ragweed occurrence information from GBIF.

trees_df <- read_rds("/data/ZHULAB/phenology/occurrence/street_trees.rds")

cl <- makeCluster(length(site_list), outfile = "")
registerDoSNOW(cl)

ragweed_df_city_list <- foreach(
  siteoi = site_list,
  .packages = c("tidyverse", "raster", "sf", "spocc")
) %dopar% {
  # get extent
  trees_df_city <- trees_df %>%
    filter(site == siteoi) %>%
    drop_na(lon, lat)
  bbox <- extent(min(trees_df_city$lon), max(trees_df_city$lon), min(trees_df_city$lat), max(trees_df_city$lat))
  # make it a polygon
  bbox_sp <- as(bbox, "SpatialPolygons")
  projection(bbox_sp) <- "+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"

  # get gbif data
  res <- occ(
    query = "Ambrosia", from = "gbif", has_coords = TRUE, limit = 1e6,
    geometry = st_bbox(bbox_sp),
    date = c(as.Date("2018-01-01"), as.Date("2021-12-31")),
    gbifopts = list(
      hasGeospatialIssue = FALSE
    )
  )

  # get coordinates
  ragweed_df_city <- res$gbif$data[[1]] %>%
    dplyr::select(lon = longitude, lat = latitude, species) %>%
    mutate(site = siteoi) %>%
    mutate(family = "Asteraceae") %>%
    mutate(genus = "Ambrosia") %>%
    mutate(genus_id = 998)
  ragweed_df_city
}
stopCluster(cl)

ragweed_df <- bind_rows(ragweed_df_city_list)
write_rds(ragweed_df, "/data/ZHULAB/phenology/occurrence/ragweed.rds")

1.2.4 View plant occurrence data

Put tree, grass, and ragweed data in one data frame.

trees_df <- read_rds("/data/ZHULAB/phenology/occurrence/street_trees.rds")
grass_df <- read_rds("/data/ZHULAB/phenology/occurrence/grass.rds")
ragweed_df <- read_rds("/data/ZHULAB/phenology/occurrence/ragweed.rds")
plant_df <- bind_rows(trees_df, grass_df, ragweed_df)

Map relative position of tree inventory and nab station.

ggplot() +
  geom_polygon(data = map_data("state"), aes(x = long, y = lat, group = group), fill = "white") +
  geom_path(data = map_data("state"), aes(x = long, y = lat, group = group), color = "grey50", alpha = 0.5, lwd = 0.2) +
  theme_void() +
  geom_point(
    data = plant_df %>%
      filter(genus %in% taxa_short_list | family %in% taxa_short_list) %>%
      mutate(taxa = case_when(
        family %in% c("Poaceae", "Cupressaceae", "Pinaceae") ~ family,
        TRUE ~ genus
      )) %>%
      # filter(!taxa%in% c("Poaceae", "Ambrosia")) %>%
      group_by(site) %>%
      sample_n(100) %>%
      ungroup(),
    aes(x = lon, y = lat), col = "dark blue", alpha = 0.5
  ) +
  geom_point(data = meta_df %>% filter(site %in% site_list), aes(x = lon, y = lat), cex = 3, pch = 10, col = "red", alpha = 1) +
  # geom_label_repel(data=meta_df %>% filter(site %in% site_list), aes(x=lon, y=lat, label = sitename))+
  coord_equal()

Calculate distance from plants to NAB stations in the unit of km.

distance_df<-plant_df %>%
  filter(genus %in% taxa_short_list | family %in% taxa_short_list) %>% 
  left_join(meta_df %>% dplyr::select (site, sitelon=lon, sitelat=lat, sitename) %>% drop_na(), by="site") %>% 
  rowwise() %>% 
  mutate(distance=distm(c(lon, lat), c(sitelon, sitelat), fun = distHaversine) %>% as.numeric() %>% `/`(1000)) %>% # distance in the unit of km
  ungroup() %>% 
  group_by(site, sitename) %>% 
  summarise(mindist=min(distance),
            maxdist=max(distance),
            meandist=mean(distance))
write_rds(distance_df, "/data/ZHULAB/phenology/occurrence/distance_from_plants_to_nab_stations.rds")
read_rds("/data/ZHULAB/phenology/occurrence/distance_from_plants_to_nab_stations.rds")
## # A tibble: 10 x 5
## # Groups:   site [10]
##    site  sitename     mindist maxdist meandist
##    <chr> <chr>          <dbl>   <dbl>    <dbl>
##  1 AT    Austin      37.1        40.8    38.8 
##  2 DT    Detroit     83.1        98.1    92.2 
##  3 DV    Denver      67.0       134.     96.4 
##  4 HT    Houston     13.9        38.5    24.8 
##  5 KC    Kansas City  0.505      41.9    25.1 
##  6 NY    New York     0.0904     37.7    16.5 
##  7 SJ    San Jose     0.128      25.0    13.1 
##  8 SL    St. Louis    1.27       40.3    20.0 
##  9 ST    Seattle      0.0925     23.2     9.49
## 10 TP    Tampa        0.00420    45.4    12.1

Map plant occurrence in each city.

ggplot() +
  theme_classic() +
  geom_point(
    data = plant_df %>%
      filter(genus %in% taxa_short_list | family %in% taxa_short_list) %>%
      mutate(taxa = case_when(
        family %in% c("Poaceae", "Cupressaceae", "Pinaceae") ~ family,
        TRUE ~ genus
      )) %>%
      # filter(!taxa%in% c("Poaceae", "Ambrosia")) %>%
      group_by(taxa, site) %>%
      sample_n(min(100, n())) %>%
      ungroup() %>%
      left_join(meta_df %>% dplyr::select(site, sitename), by = "site"),
    aes(x = lon, y = lat, col = taxa), alpha = 0.3
  ) +
  facet_wrap(. ~ sitename, scales = "free")

1.3 NPN data

phenophases <- npn_phenophases()
species_list <- npn_species()

# download all NPN data for taxa studied
npn_path <- "/data/ZHULAB/phenology/NPN/"

cl <- makeCluster(length(site_list), outfile = "")
registerDoSNOW(cl)

npn_df_list <- vector(mode = "list")
for (taxaoi_short in taxa_short_list %>% unique()) {
  if (!file.exists(paste0(npn_path, taxaoi_short, ".rds"))) {
    spid <- species_list %>%
      filter(genus == taxaoi_short | family_name == taxaoi_short) %>%
      pull(species_id)

    npn_data <- npn_download_status_data(request_source = "YS", years = c(2000:2022), species_id = spid)

    write_rds(npn_data, paste0(npn_path, taxaoi_short, ".rds"))
  } else {
    npn_data <- read_rds(paste0(npn_path, taxaoi_short, ".rds"))
  }

  npn_taxa_df_list <- foreach(
    siteoi = site_list,
    .packages = c("tidyverse", "geosphere")
  ) %dopar% {
    lat_oi <- meta_df %>%
      filter(site == siteoi) %>%
      pull(lat) %>%
      mean()
    lon_oi <- meta_df %>%
      filter(site == siteoi) %>%
      pull(lon) %>%
      mean()

    npn_flower_df <- npn_data %>%
      filter(phenophase_status == 1) %>%
      filter(phenophase_description %in% c("Full pollen release (conifers)", "Pollen release (conifers)", "Pollen cones (conifers)", "Open pollen cones (conifers)", "Full flowering (50%)", "Flowers or flower buds", "Pollen release (flowers)"))

    if (nrow(npn_flower_df > 0)) {
      npn_flower_df <- npn_flower_df %>%
        rowwise() %>%
        mutate(distance = distm(c(longitude, latitude), c(lon_oi, lat_oi), fun = distHaversine) %>% as.numeric()) %>% # distance in the unit of m
        ungroup() %>%
        filter(distance <= 500000) %>% # within 500 km of the NAB station
        dplyr::select(date = observation_date, lon = longitude, lat = latitude, doy = day_of_year) %>%
        mutate(date = as.Date(date)) %>%
        mutate(site = siteoi)
    } else {
      npn_flower_df <- data.frame(lon = numeric(0), lat = numeric(0), doy = integer(0), date = character(0), site = character(0)) %>% mutate(date = as.Date(date))
    }

    npn_count_df <- npn_flower_df %>%
      group_by(site, date) %>%
      summarise(count = n()) %>%
      ungroup()

    print(paste0(taxaoi_short, ", ", siteoi))
    npn_count_df
  }

  npn_df_list[[taxaoi_short]] <- bind_rows(npn_taxa_df_list) %>%
    mutate(taxa = taxaoi_short)
}
stopCluster(cl)
npn_df <- bind_rows(npn_df_list)
write_rds(npn_df, paste0(npn_path, "npn_dat.rds"))

View flowering phenology (from NPN data) in study sites.

npn_path <- "/data/ZHULAB/phenology/NPN/"
npn_df_all<-read_rds(paste0(npn_path, "npn_dat.rds"))
npn_df_all %>%
  left_join(meta_df %>% dplyr::select(id, site, sitename), by = "site") %>%
  filter(site %in% site_list) %>%
  filter(taxa %in% taxa_short_list) %>%
  mutate(doy = format(date, "%j") %>% as.integer()) %>%
  mutate(year = format(date, "%Y") %>% as.integer()) %>%
  # filter(year %in% year_list) %>% 
  group_by(taxa, site) %>% 
  mutate(count_st=(count-min(count, na.rm = T))/(max(count, na.rm = T)-min(count, na.rm = T))) %>%
  ggplot() +
  geom_point(aes(x = doy, y = count_st, group = taxa, col = taxa), alpha=0.3) +
  facet_grid(cols = vars(taxa), rows = vars(sitename), scales = "free_y") +
  theme_classic()
## Warning: Removed 19 rows containing missing values (geom_point).

1.4 CHELSA data

# Downloaded from: https://chelsa-climate.org/bioclim/
# Documentation: https://chelsa-climate.org/wp-admin/download-page/CHELSA_tech_specification_V2.pdf

chelsa_path <- "/data/ZHULAB/phenology/CHELSA/"
# read in raster
tmean_ras <- raster(paste0(chelsa_path, "bio1.tif"))
ppt_ras <- raster(paste0(chelsa_path, "bio12.tif"))
vpdmax_ras <- raster(paste0(chelsa_path, "vpd_max.tif"))

# sites as points
meta_sf <- meta_df %>%
  drop_na(site) %>%
  dplyr::select(site, lon, lat) %>%
  st_as_sf(
    coords = c("lon", "lat"),
    crs = 4326
  )

# extract chelsa data at points
chelsa_df <- meta_sf %>%
  mutate(
    mat = raster::extract(tmean_ras, meta_sf),
    tap = raster::extract(ppt_ras, meta_sf),
    vpd = raster::extract(vpdmax_ras, meta_sf)
  ) %>%
  as_tibble() %>%
  dplyr::select(-geometry)

1.5 PlanetScope data

1.5.1 Order

User settings.

# Set API
# api_key = "REMOVED" #ysong67
api_key <- "REMOVED" # xcui12

# Metadata filters
cloud_lim <- 0.99 # percent from 0-1
item_name <- "PSScene4Band"
# PSOrthoTile, PSScene3Band, PSScene4Band, Sentinel2L1C
# (see https://developers.planet.com/docs/data/items-assets/)
product <- "analytic_sr"
# analytic_b1, analytic_b2
# (see https://developers.planet.com/docs/data/items-assets/)

ps_path <- "/data/ZHULAB/phenology/PlanetScope/"

Order data. Orders may fail. May have to order again in that case.

for (siteoi in site_list) {
  ps_path_site <- paste0(ps_path, siteoi, "/")
  dir.create(ps_path_site, recursive = T)
  dir.create(paste0(ps_path_site, "orders/"), recursive = T)

  # Set AOI (many ways to set this!) ultimately just need an extent()
  plant_df_site <- plant_df %>%
    filter(site == siteoi) %>%
    drop_na(lon, lat)
  bbox <- extent(min(plant_df_site$lon), max(plant_df_site$lon), min(plant_df_site$lat), max(plant_df_site$lat))

  for (year_download in 2017:2022) {
    order_df <- data.frame(year = integer(0), month = integer(0), id = character(0))
    if (year_download == 2022) {
      month_end <- 4
    } else {
      month_end <- 12
    }
    for (month_download in 1:month_end) {
      # Date range of interest
      start_year <- year_download
      end_year <- year_download
      date_start <- lubridate::floor_date(as.Date(paste0(year_download, "-", str_pad(month_download, 2, pad = "0"), "-01")), unit = "month")
      date_end <- lubridate::ceiling_date(as.Date(paste0(year_download, "-", str_pad(month_download, 2, pad = "0"), "-01")), unit = "month") - 1
      start_doy <- as.numeric(format(date_start, "%j"))
      end_doy <- as.numeric(format(date_end, "%j"))

      # Create order name
      order_name <- paste(siteoi, item_name, product, start_year, end_year, start_doy, end_doy, sep = "_")

      # Planet Orders API
      order_id <- planetR:::planet_order_request(
        api_key = api_key,
        bbox = bbox,
        date_start = date_start,
        date_end = date_end,
        start_doy = start_doy,
        end_doy = end_doy,
        cloud_lim = cloud_lim,
        item_name = item_name,
        product = product,
        order_name = order_name
      )

      # when there are too many items in an order, order id will not be returned. split order into a few groups in that case.
      if (!is.null(order_id)) {
        # store order name
        order_df <- order_df %>%
          bind_rows(data.frame(year = year_download, month = month_download, id = order_id))
      } else {
        item_num <- nrow(planet_search(
          bbox, start_doy, end_doy, date_end,
          date_start, cloud_lim, item_name, api_key
        ))
        group_num <- ceiling(item_num / 500)
        split_num <- ceiling(30 / group_num)
        doy_group <- split(seq(start_doy, end_doy), floor(seq(1:(end_doy - start_doy + 1)) / split_num))
        for (g in 1:length(doy_group)) {
          order_id <- planetR:::planet_order_request(
            api_key = api_key,
            bbox = bbox,
            date_start = date_start,
            date_end = date_end,
            start_doy = min(doy_group[[g]]),
            end_doy = max(doy_group[[g]]),
            cloud_lim = cloud_lim,
            item_name = item_name,
            product = product,
            order_name = order_name
          )
          if (!is.null(order_id)) {
            order_df <- order_df %>%
              bind_rows(data.frame(year = year_download, month = month_download, id = order_id))
          }
        }
      }

      print(paste(year_download, ", ", month_download))
    }
    dir.create(paste0(ps_path_site, "orders/"))
    write_rds(order_df, paste0(ps_path_site, "orders/", "order_", year_download, ".rds"))
  }
}

1.5.2 Download

for (siteoi in site_list) {
  ps_path_site <- paste0(ps_path, siteoi, "/")
  for (year_download in 2017:2022) {
    order_df <- read_rds(paste0(ps_path_site, "orders/", "order_", year_download, ".rds"))
    cl <- makeCluster(nrow(order_df), outfile = "")
    registerDoSNOW(cl)
    foreach(
      i = 1:nrow(order_df),
      .packages = c("stringr", "planetR", "httr")
    ) %dopar% {

      # Get order id
      month_download <- order_df$month[i]
      order_id <- order_df$id[i]

      # Date range of interest
      start_year <- year_download
      end_year <- year_download
      date_start <- lubridate::floor_date(as.Date(paste0(year_download, "-", str_pad(month_download, 2, pad = "0"), "-01")), unit = "month")
      date_end <- lubridate::ceiling_date(as.Date(paste0(year_download, "-", str_pad(month_download, 2, pad = "0"), "-01")), unit = "month") - 1
      start_doy <- as.numeric(format(date_start, "%j"))
      end_doy <- as.numeric(format(date_end, "%j"))

      # Set/Create Export Folder
      order_name <- paste(siteoi, item_name, product, start_year, end_year, start_doy, end_doy, sep = "_")
      exportfolder <- paste0(ps_path_site, order_name)
      dir.create(exportfolder, recursive = T, showWarnings = F)

      # Download
      Sys.sleep(i * 0.5) # Otherwise sending request to API at the same time may cause error
      planet_order_download_new(order_id, exportfolder, api_key = api_key, order_num = i, overwrite_opt = FALSE)

      print(paste(year_download, ", ", month_download))
    }

    stopCluster(cl)
  }
}

1.5.3 Retrieve time series at plant locations

cl <- makeCluster(50, outfile = "")
registerDoSNOW(cl)

iscomplete <- F
while (!iscomplete) { # restart when there is error, usually because of cluster connection issues
  iserror <- try(
    for (taxaoi in taxa_list) {
      taxaoi_short <- str_split(taxaoi, " ", simplify = T)[1]
      for (s in 1:length(site_list)) {
        # get plant locations
        siteoi <- site_list[s]
        plant_taxa_df <- plant_df %>%
          filter(site == siteoi) %>%
          filter(genus == taxaoi_short | family == taxaoi_short) %>%
          mutate(id = row_number()) %>%
          drop_na(lon, lat)

        # skip when there are too few plants
        if (taxaoi_short %in% c("Ambrosia", "Poaceae")) {
          min_sample_size <- 1
        } else {
          min_sample_size <- 1
        }

        if (nrow(plant_taxa_df) >= min_sample_size) {
          # plants as points
          plant_taxa_sp <- SpatialPoints(plant_taxa_df[, c("lon", "lat")],
            proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
          )

          if (!file.exists(paste0(ps_path, "analyses/ps_", siteoi, "_", taxaoi_short, ".rds"))) {
            # read reflectance data
            files <- list.files(path = paste0(ps_path, siteoi), pattern = ".*_SR_clip.tif$", recursive = T, full.names = T) %>% sort()
            nday <- length(files)
            nloc <- length(plant_taxa_sp)
            ps_mat <- foreach(
              f = 1:nday,
              .packages = c("raster"),
              .combine = "rbind"
            ) %dopar% {
              file <- files[f]
              ps_st <- stack(file)

              trees_sp_reproj <- spTransform(plant_taxa_sp, CRSobj = CRS(proj4string(ps_st)))

              ps_values <- cbind(raster::extract(ps_st, trees_sp_reproj), f, id = 1:nloc)
              print(paste0(f, " out of ", nday))
              ps_values[complete.cases(ps_values), ]
            }

            # read quality assessment data
            # 0 - fully usable data
            # other - potentially problematic/unusable data
            #
            # Full description is in Planet's documentation (Page 91, Section 2. UNUSABLE DATA MASK FILE).
            files <- list.files(path = paste0(ps_path, siteoi), pattern = ".*_udm_clip.tif$", recursive = T, full.names = T) %>% sort()
            nday <- length(files)
            nloc <- length(plant_taxa_sp)
            ps_mask_mat <- foreach(
              f = 1:nday,
              .packages = c("raster"),
              .combine = "rbind"
            ) %dopar% {
              file <- files[f]
              ps_ras <- raster(file)

              trees_sp_reproj <- spTransform(plant_taxa_sp, CRSobj = CRS(proj4string(ps_ras)))

              ps_values <- cbind(qa = raster::extract(ps_ras, trees_sp_reproj), f, id = 1:nloc)

              print(paste0(f, " out of ", nday))
              ps_values[complete.cases(ps_values), ]
            }
            colnames(ps_mask_mat) <- c("qa", "f", "id")
            stopCluster(cl)

            # get corresponding timing from file names
            time_df <- list.files(path = paste0(ps_path, siteoi), pattern = ".*_SR_clip.tif$", recursive = T) %>%
              sort() %>%
              str_split(pattern = "/", simplify = T) %>%
              data.frame() %>%
              dplyr::select(filename = X2) %>%
              rowwise() %>%
              mutate(time = strptime(paste0(str_split(filename, pattern = "_")[[1]][1], str_split(filename, pattern = "_")[[1]][2]), format = "%Y%m%d%H%M%OS")) %>%
              ungroup() %>%
              mutate(f = row_number()) %>%
              dplyr::select(-filename)

            # assign id to each plant
            coord_df <- coordinates(plant_taxa_sp) %>%
              as_tibble() %>%
              mutate(id = row_number())

            # join data
            ps_df <- ps_mat %>%
              as_tibble() %>%
              left_join(time_df, by = "f") %>%
              left_join(coord_df, by = "id") %>%
              mutate(
                red = red * 0.0001, # scaling following Dixon et al's code
                green = green * 0.0001,
                blue = blue * 0.0001,
                nir = nir * 0.0001
              ) %>%
              # mutate(evi=2.5* (nir-red) / (nir + 6*red - 7.5*blue + 1),
              #        gndvi=(nir-green)/(nir+green),
              #        ebi= (red + green + blue) / (green / blue * (red - blue + 1))) %>%
              left_join(ps_mask_mat %>% as_tibble(), by = c("id", "f")) %>%
              dplyr::select(-f)

            # save
            write_rds(ps_df, paste0(ps_path, "analyses/ps_", siteoi, "_", taxaoi_short, ".rds"))
          }
        }
      }
    }
  )

  if (class(iserror) != "try-error") {
    iscomplete <- T
  } else if (class(iserror) == "try-error") { # restart cluster
    iscomplete <- F
    closeAllConnections()
    cl <- makeCluster(50, outfile = "")
    registerDoSNOW(cl)
  }
}
stopCluster(cl)

Example time series.

site_vis <- "DT"
taxa_vis <- "Quercus"
set.seed(1)
read_rds(paste0(ps_path, "analyses/ps_", site_vis, "_", taxa_vis, ".rds")) %>%
  filter(id %in% ((.) %>% pull(id) %>% sample(4))) %>% # four random trees
  filter(qa == 0) %>%
  gather(key = "band", value = "value", -id, -time, -lon, -lat, -qa) %>%
  ggplot() +
  geom_point(aes(x = time, y = value, col = band), alpha = 0.25) +
  facet_wrap(. ~ id, ncol = 1) +
  theme_classic() +
  ggtitle(paste0("Taxa: ", taxa_vis, "; Site: ", site_vis))

1.6 HLS

2 Correlation betwen leafing and flowering phenology on individual level

2.1 Preparation

Some settings.

# set color palette
cols <- c("EVI (PS)" = "dark green", "G2R (PS)" = "yellow green", "EBI (PS)" = "orange", "pollen count (NAB)" = "dark red", "flower observation (USA-NPN)" = "dark orchid", "flowering frequency" = "dark blue", "flower observation (DT)" = "coral")

# possible green up and green down thresholds
thres_list_up <- seq(from = 0, to = 1, by = 0.1) %>% round(1)
thres_list_down <- seq(from = 1, to = 0.0, by = -0.1) %>% round(1)
thres_df <- bind_rows(
  data.frame(direction = "up", threshold = thres_list_up),
  data.frame(direction = "down", threshold = thres_list_down)
)

# read in manually determined window of flowering
flower_window_df <- read_csv("/data/ZHULAB/phenology/nab/flower_window.csv")

Is there even a typical phenological curve? This is a function to rule out curves that are flat.

flat_better <- function(ts, doy = (274 - 365):(365 + 151), k = 50) {
  fit_list <- vector(mode = "list")

  # fit simple linear regression model
  fit0 <- lm(ts ~ doy, data = data.frame(doy, ts))
  fit_list[[1]] <- data.frame(AIC = AIC(fit0, k = k), model = "fit0")

  # fit piecewise regression model to original model with different numbers of breakpoints
  try( # sometimes fitting may fail
    {
      fit1 <- segmented(fit0, seg.Z = ~doy, npsi = 1, it = 10, control = seg.control(seed = 42, fix.npsi = T))
      fit_list[[2]] <- data.frame(AIC = AIC(fit1, k = k), model = "fit1")
    },
    silent = T
  )

  try(
    {
      fit2 <- segmented(fit0, seg.Z = ~doy, npsi = 2, it = 10, control = seg.control(seed = 42, fix.npsi = T))
      fit_list[[3]] <- data.frame(AIC = AIC(fit2, k = k), model = "fit2")
    },
    silent = T
  )

  try(
    {
      fit3 <- segmented(fit0, seg.Z = ~doy, npsi = 3, it = 10, control = seg.control(seed = 42, fix.npsi = T))
      fit_list[[4]] <- data.frame(AIC = AIC(fit3, k = k), model = "fit3")
    },
    silent = T
  )

  # rank by AIC. k=50 sets a large penalty for more complex models
  aic_df <- bind_rows(fit_list) %>%
    arrange(AIC, model)

  # return 1 when straight line is the best fit
  better <- aic_df %>%
    head(1) %>%
    pull(model) == "fit0"
  return(better)
}

2.2 Compare with individual flowering observations (Detroit oaks)

Trees in detroit.

taxaoi <- "Quercus"
siteoi <- "DT"
yearoi <- 2017

taxaoi_short <- str_split(taxaoi, " ", simplify = T)[1]
flower_window <- seq(flower_window_df %>% filter(taxa == taxaoi) %>% pull(start),
  flower_window_df %>% filter(taxa == taxaoi) %>% pull(end),
  by = 1
)
thres_df_taxa <- thres_df %>% filter(direction == "up")
plant_taxa_df <- plant_df %>%
  filter(site == siteoi) %>%
  filter(genus == taxaoi_short | family == taxaoi_short) %>%
  mutate(id = row_number()) %>%
  drop_na(lon, lat)

# plant_df %>%
#   filter(site==siteoi) %>%
#   filter(genus==taxaoi_short|family==taxaoi_short) %>%
#   group_by(species) %>%
#   summarise(n=n()) %>%
#   ungroup() %>%
#   arrange(desc(n))

plant_taxa_df %>%
  ggplot() +
  geom_point(aes(x = lon, y = lat, col = species)) +
  theme_classic()

Flowering observations.

detroit_df <- read_csv("/data/ZHULAB/phenology/occurrence/StreetTrees/Detroit_oak_pheno_obs_spring_2017.csv")

detroit_df_ts <- detroit_df %>%
  left_join(detroit_df %>%
    distinct(tree) %>%
    mutate(id = row_number()),
  by = "tree"
  ) %>%
  left_join(trees_df %>% filter(site == "DT") %>% dplyr::select(id, lon, lat), by = c("tree" = "id")) %>% # trees_df uses "tree" as "id"
  left_join(detroit_df %>%
    group_by(tree) %>%
    arrange(julian_day) %>%
    summarize(
      mindoy = min(julian_day),
      # peak=findpeaks(flowering_interp, sortstr = T)[1,2] %>% as.numeric(),
      thres = which(flowering_interp >= 95) %>% min() # first day that reaches 95% flowering 
    ) %>%
    mutate(peak = thres + mindoy - 1) %>% # correct with starting doy
    ungroup() %>%
    dplyr::select(tree, peak), by = c("tree")) %>%
  arrange(peak) %>%
  mutate(id = as.character(id)) %>%
  mutate(date = as.Date("2017-01-01") + julian_day - 1) %>%
  mutate(peak_date = as.Date("2017-01-01") + peak - 1)
## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf

## Warning in min(.): no non-missing arguments to min; returning Inf
detroit_df_ts %>%
  filter(id %in% ((.) %>% distinct(id) %>% pull(id) %>% sample(6))) %>%
  ggplot() +
  geom_point(aes(x = date, y = flowering_raw), col = "coral") +
  geom_line(aes(x = date, y = flowering_interp)) +
  geom_vline(aes(xintercept = peak_date), col = "dark orchid") +
  facet_wrap(. ~ id, ncol = 1) +
  ylab("flowering status")+
  theme_classic()
## Warning: Removed 292 rows containing missing values (geom_point).

View EVI, NAB, and NPN data together for one tree.

# preprocess ps data
ps_df <- read_rds(paste0(ps_path, "analyses/ps_", siteoi, "_", taxaoi_short, ".rds"))
ps_df_proc <- ps_df %>%
  drop_na() %>%
  mutate(date = as.Date(time)) %>%
  mutate(
    year = format(time, "%Y") %>% as.integer(),
    doy = format(time, "%j") %>% as.integer(),
    hour = format(strptime(time, "%Y-%m-%d %H:%M:%S"), "%H") %>% as.integer()
  ) %>%
  filter(qa == 0) %>%
  group_by(id, lon, lat, date, year, doy) %>%
  summarise(
    blue = mean(blue),
    green = mean(green),
    red = mean(red),
    nir = mean(nir)
  ) %>%
  ungroup() %>%
  mutate(evi = 2.5 * (nir - red) / (nir + 6 * red - 7.5 * blue + 1)) %>%
  filter(evi > 0, evi <= 1) %>%
  filter(red > 0, green > 0, blue > 0)

# subset nab data
pollen_df <- nab_with_taxa_df %>%
  left_join(meta_df %>% dplyr::select(id, site), by = "id") %>%
  filter(site == siteoi) %>%
  filter(genus == taxaoi_short | family == taxaoi_short)

# subset npn data
npn_df <- npn_df_all %>%
  filter(site == siteoi) %>%
  filter(taxa == taxaoi_short)

# join ps, nab, and npn data
ts_df <- ps_df_proc %>%
  left_join(plant_taxa_df, by = c("id", "lon", "lat")) %>%
  dplyr::select(id, date,
    `EVI (PS)` = evi 
  ) %>%
  mutate(id = as.factor(id)) %>%
  full_join(pollen_df %>%
    dplyr::select(date, `pollen count (NAB)` = count) %>%
    mutate(id = "pollen"),
  by = c("date", "id")
  ) %>%
  full_join(npn_df %>%
    dplyr::select(date, `flower observation (USA-NPN)` = count) %>%
    mutate(id = "npn"),
  by = c("date", "id")
  ) %>%
  arrange(id, date) %>%
  mutate(doy = format(date, "%j") %>% as.numeric()) %>%
  mutate(year = format(date, "%Y") %>% as.numeric())

# focus on one year, spanning from day 274 in the previous year to day 151 in the next year
ts_df_subset <- ts_df %>%
  filter(doy != 366) %>%
  # filter(doy>start_doy,doy<=end_doy) %>%
  filter(year == yearoi | year == (yearoi - 1) | year == (yearoi + 1)) %>%
  mutate(doy = ifelse(doy > 273 & year == yearoi - 1, doy - 365, doy)) %>%
  mutate(year = ifelse(doy <= 0 & year == yearoi - 1, year + 1, year)) %>%
  mutate(doy = ifelse(doy < 152 & year == yearoi + 1, doy + 365, doy)) %>%
  mutate(year = ifelse(doy > 365 & year == yearoi + 1, year - 1, year)) %>%
  filter(year == yearoi) %>%
  gather(key = "var", value = "value", -date, -id, -doy, -year) %>%
  mutate(var = fct_relevel(var, levels = c("EVI (PS)", "G2R (PS)", "EBI (PS)", "pollen count (NAB)", "flower observation (USA-NPN)"))) %>% 
  arrange(doy)
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
## Warning: Unknown levels in `f`: G2R (PS), EBI (PS)
# join with flowering data
ts_df_subset <- ts_df_subset %>%
  bind_rows(detroit_df_ts %>% dplyr::select(id, date, lon, lat, value = flowering_raw, doy = julian_day) %>%
    mutate(
      var = "flower observation (DT)",
      year = 2017
    ))

# summarize quantiles on the site level
ts_df_subset_summary <- ts_df_subset %>%
  drop_na(value) %>%
  group_by(date, var, doy, year) %>%
  summarise(
    q1 = quantile(value, 0.05, na.rm = T),
    q2 = quantile(value, 0.5, na.rm = T),
    q3 = quantile(value, 0.95, na.rm = T),
    n = n()
  ) %>%
  filter(n > 1) %>%
  ungroup()

# visualize for one tree
idoi="1"
plant_sp<-plant_taxa_df %>% filter(id==idoi) %>% pull(species)
ts_df_subset %>%
  filter(id == idoi | id == "pollen" | id == "npn") %>%
  ggplot() +
  geom_point(aes(x = date, y = value, col = var)) +
  facet_wrap(. ~ var, ncol = 1, scales = "free_y") +
  scale_color_manual(values = cols) +
  ylab("")+
  ggtitle(paste0("Site: ", siteoi,"  Year: ", yearoi,"  ID: ", idoi, "  Species: ", plant_sp))+
  theme_classic()
## Warning: Removed 1113 rows containing missing values (geom_point).

Detect greenup at multiple thresholds.

get_doy<-function (plant_df, thres_df, ts_df, idoi) {
  plant_sp<-plant_df %>% filter(id==idoi) %>% pull(species)
  px_doy<-ts_df %>% filter(id==idoi) %>% filter(var=="EVI (PS)") %>% pull(doy)
  
  if (length(px_doy [px_doy>=1 & px_doy<=365])<50) {
    flower_df_up<-flower_df_down<-NULL
    print("too few data points")
  }  else {
    px_evi<-ts_df %>% filter(id==idoi) %>% filter(var=="EVI (PS)") %>% pull(value)
    px_evi_in<-approx(px_doy, px_evi, (274-365):(365+151), rule=1)$y
    
    px_evi_in_sm<-whitfun(px_evi_in,30)
    
    flatbetter<-flat_better(px_evi_in_sm, k=50)
    
    ### green down
    thres_list_down<-thres_df %>% filter(direction=="down") %>% pull(threshold)
    if (length(thres_list_down)==0) {
      flower_df_down<-NULL
    } else {
      max_evi<-quantile(px_evi_in_sm[(-274+365+1):(-274+365+365+1)], 1, na.rm = T)
      start_doy<-which(!is.na(px_evi_in_sm[(-274+365+1):(-274+365+365+1)])&px_evi_in_sm[(-274+365+1):(-274+365+365+1)]>=max_evi) %>% max()-274+365
      min_evi<-quantile(px_evi_in_sm[start_doy:length(px_evi_in_sm)], 0, na.rm = T)
      min_doy<-which(!is.na(px_evi_in_sm[start_doy:length(px_evi_in_sm)])& px_evi_in_sm[start_doy:length(px_evi_in_sm)]<=min_evi) %>% min()+start_doy
      end_doy<-min_doy
      
      param_ok2<- (end_doy>start_doy) & (!flatbetter)
      
      if (!param_ok2) {
        greendown_doy<-rep(NA, length(thres_list_down))
        start_doy<-NA
        end_doy<-NA
        print("not typical growth curve")
      } else {
        greendown_thres<-(max_evi-min_evi)*thres_list_down+min_evi
        greendown_doy<-rep(NA, length(greendown_thres))
        for (t in 1:length(greendown_thres)) {
          greendown_doy[t]<-which(px_evi_in_sm[start_doy:end_doy]<=greendown_thres[t]) %>% min()+start_doy
        }
        start_doy=start_doy+274-365
        end_doy=end_doy+274-365
        greendown_doy=greendown_doy+274-365
      }
      flower_df_down<-data.frame(id=idoi, start=start_doy, end=end_doy,direction="down", thres=thres_list_down, doy=greendown_doy) 
    }
    
    ### green up
    thres_list_up<-thres_df %>% filter(direction=="up") %>% pull(threshold)
    if (length(thres_list_up)==0) {
      flower_df_up<-NULL
    } else {
      max_evi<-quantile(px_evi_in_sm[(-274+365+1):(-274+365+365+1)], 1, na.rm = T)
      end_doy<-which(!is.na(px_evi_in_sm[(-274+365+1):(-274+365+365+1)])&px_evi_in_sm[(-274+365+1):(-274+365+365+1)]>=max_evi) %>% min()-274+365
      min_evi<-quantile(px_evi_in_sm[1:end_doy], 0.00, na.rm = T)
      min_doy<-which(!is.na(px_evi_in_sm[1:end_doy])& px_evi_in_sm[1:end_doy]<=min_evi) %>% max()
      start_doy<-min_doy
      
      param_ok2<- (end_doy>start_doy) & (!flatbetter)
      
      if (!param_ok2) {
        greenup_doy<-rep(NA, length(thres_list_up))
        start_doy<-NA
        end_doy<-NA
        print("not typical growth curve")
      } else {
        greenup_thres<-(max_evi-min_evi)*thres_list_up+min_evi
        greenup_doy<-rep(NA, length(greenup_thres))
        for (t in 1:length(greenup_thres)) {
          greenup_doy[t]<-which(px_evi_in_sm[start_doy:end_doy]>=greenup_thres[t]) %>% min()+start_doy 
        }
        start_doy=start_doy+274-365
        end_doy=end_doy+274-365
        greenup_doy=greenup_doy+274-365
      }
      flower_df_up<-data.frame(id=idoi, start=start_doy, end=end_doy,direction="up", thres=thres_list_up, doy=greenup_doy) 
    }
  }
  
  flower_df<-bind_rows(flower_df_up, flower_df_down)
  
  return (flower_df)
}
plot_tree<-function (plant_df, ts_df, flower_df, idoi)  {
  plant_sp<-plant_df %>% filter(id==idoi) %>% pull(species)
  
    p_1tree<-ggplot( )+
      geom_point(data=ts_df_subset %>% filter(id==idoi|id=="npn"|id=='pollen') %>% 
                   filter(var %in% c("EVI (PS)","pollen count (NAB)", "flower observation (USA-NPN)", "flower observation (DT)")),
                 aes(doy, value,group=as.factor(id), col=var))+
      theme_classic()+
      guides(col=F,
             alpha=F)+
      scale_color_manual(values=cols)+
      facet_wrap(.~var, ncol=1, scales = "free_y")+
      xlab("day of year")+
      ylab("")+
      ggtitle(paste0("Site: ", siteoi,"  Year: ", yearoi,"  ID: ", idoi, "  Species: ", plant_sp))
    if (nrow(flower_df)==0) {
      p_1tree
    } else {
      p_1tree<-p_1tree+
        geom_vline(data=flower_df, aes(xintercept = start), col="red", alpha=0.8)+
        geom_vline(data=flower_df, aes(xintercept = end), col="red", alpha=0.8)+
        geom_vline(data=flower_df, aes(xintercept = doy), col="red", alpha=0.3)
    }
    
  return (p_1tree)
}
idoi <- "1"
flower_df <- get_doy(plant_taxa_df, thres_df_taxa,ts_df_subset, idoi)
## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf
p_1tree <- plot_tree(plant_taxa_df, ts_df_subset, flower_df, idoi)
p_1tree
## Warning: Removed 1113 rows containing missing values (geom_point).

Repeat for all trees to get frequency distribution of green-up dates.

# get green-up/green-down doy for each tree
random_id <- unique(plant_taxa_df$id)
flower_df_list <- vector(mode = "list", length = length(random_id))
for (i in 1:length(random_id)) {
  blas_set_num_threads(1)
  omp_set_num_threads(1)

  idoi <- as.character(random_id)[i]

  flower_df <- get_doy(plant_taxa_df, thres_df_taxa,ts_df_subset, idoi)

  # print(paste0(i , " out of ", length(random_id)))
  flower_df_list[[i]] <- flower_df
}
## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf
## [1] "too few data points"
## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf
## [1] "too few data points"
## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf
## Warning in min(.): no non-missing arguments to min; returning Inf
## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf

## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in min(which(!is.na(x[(max_id + 1):length(x)]))): no non-missing
## arguments to min; returning Inf
# bind with species and site info
flower_doy_df <- bind_rows(flower_df_list) %>%
  left_join(plant_taxa_df %>% dplyr::select(id, species, lat, lon) %>% mutate(id = as.character(id)), by = "id") %>%
  mutate(site = siteoi, year = yearoi)

# summarize frequency on the site level
flower_freq_df_list <- vector(mode = "list", length = nrow(thres_df_taxa))
for (t in 1:nrow(thres_df_taxa)) {
  flower_freq_df_list[[t]] <- flower_doy_df %>%
    drop_na(start, end) %>%
    filter(
      direction == thres_df_taxa$direction[t],
      thres == thres_df_taxa$threshold[t]
    ) %>%
    group_by(doy, thres, direction) %>%
    summarise(count = n()) %>%
    ungroup() %>%
    mutate(freq = count / n())
}
flower_freq_df <- bind_rows(flower_freq_df_list)

# join frequency distribution with EVI, NAB and NPN data
flower_freq_df <- flower_freq_df %>%
  mutate(doy = factor(doy, levels = c((274 - 365):(365 + 151)))) %>%
  complete(doy, thres, direction, fill = list(count = 0, freq = 0)) %>%
  mutate(doy = doy %>% as.character() %>% as.numeric()) %>%
  arrange(doy) %>%
  full_join(ts_df_subset %>%
    filter(id == "pollen") %>%
    filter(var == "pollen count (NAB)") %>%
    filter(year == yearoi) %>%
    dplyr::select(doy, pollen = value),
  by = "doy"
  ) %>%
  full_join(ts_df_subset %>%
    filter(id == "npn") %>%
    filter(var == "flower observation (USA-NPN)") %>%
    filter(year == yearoi) %>%
    dplyr::select(doy, npn = value),
  by = "doy"
  ) %>%
  full_join(ts_df_subset_summary %>%
    filter(var == "EVI (PS)") %>%
    filter(year == yearoi) %>%
    dplyr::select(doy, evi = q2),
  by = "doy"
  ) %>%
  mutate(doy = as.numeric(doy)) %>%
  mutate(site = siteoi, year = yearoi) %>%
  drop_na(doy, thres)

Plot green-up doy at several thresholds vs. flower observations. (Only showing within the limit of day 110 to 160.)

flower_doy_df %>%
  filter(thres %in% c(0.4, 0.5, 0.6, 0.7, 0.8)) %>%
  left_join(detroit_df_ts %>% dplyr::distinct(id, peak) %>% filter(is.finite(peak)), by = "id") %>%
  ggplot() +
  geom_jitter(aes(x = peak, y = doy, group = interaction(direction, thres), col = interaction(direction, thres)), alpha = 1) +
  geom_smooth(aes(x = peak, y = doy, group = interaction(direction, thres), col = interaction(direction, thres)), method = "lm") +
  geom_abline(slope = 1, intercept = 0, col = "red") +
  ggpubr::stat_cor(
    aes(x = peak, y = doy, group = interaction(direction, thres),
        col = interaction(direction, thres),
        label = paste(..r.label.., ..p.label.., sep = "*`,`~")),
    p.accuracy = 0.05,
        label.x.npc = 0.7,
        label.y.npc = "top",
        show.legend=F
  )+
  coord_equal() +
  ylim(c(110, 160)) +
  xlim(c(110, 160)) +
  xlab ("flowering doy")+
  ylab ("green-up doy")+
  labs(col="direction and threshold")+
  theme_classic()
## Warning: Removed 77 rows containing non-finite values (stat_smooth).
## Warning: Removed 77 rows containing non-finite values (stat_cor).
## Warning: Removed 81 rows containing missing values (geom_point).

Repeat for all years and check correlation. Here only showing green-up at 50%. There were significantly positive correlations in three out of five years, suggesting that green-up and flowering time may be correlated on the individual level.

read_rds("/raid/users/ysong67/GitHub/phenology/RS4flower/output/data/Quercus/flowering day of year.rds") %>% 
  filter(site==siteoi) %>%
  bind_rows(flower_doy_df %>% mutate(year=yearoi)) %>% 
  filter(thres==0.5,
         direction=="up") %>% 
  left_join(detroit_df_ts %>% dplyr::distinct(id, peak)  %>% filter(is.finite(peak)), by="id") %>% 
  ggplot()+
  geom_jitter(aes(x=peak, y=doy, group=as.factor(year), col=as.factor(year)), alpha=1)+
  geom_smooth(aes(x=peak, y=doy, group=as.factor(year), col=as.factor(year)), method="lm")+
  geom_abline(slope = 1, intercept = 0, col="red")+
  ggpubr::stat_cor(
    aes(x = peak, y = doy, group = as.factor(year),
        col = as.factor(year),
        label = paste(..r.label.., ..p.label.., sep = "*`,`~")),
    p.accuracy = 0.05,
        label.x.npc = 0.7,
        label.y.npc = "top",
        show.legend=F
  )+
  coord_equal()+
  ylim(c(110,160 ))+
  xlim(c(110,160 ))+
  theme_classic()
## Warning: Removed 87 rows containing non-finite values (stat_smooth).
## Warning: Removed 87 rows containing non-finite values (stat_cor).
## Warning: Removed 89 rows containing missing values (geom_point).

Compare maps.

read_rds("/raid/users/ysong67/GitHub/phenology/RS4flower/output/data/Quercus/flowering day of year.rds") %>% 
  filter(site==siteoi) %>%
  bind_rows(flower_doy_df %>% mutate(year=yearoi)) %>% 
  filter(thres==0.5,
         direction=="up") %>% 
  dplyr::select(id, doy, lat, lon,year) %>% 
  mutate(year=as.factor(year)) %>% 
  bind_rows(detroit_df_ts %>% rename(doy=peak) %>% dplyr::distinct(id, doy, lat, lon) %>% filter(is.finite(doy)) %>% mutate(year="flower observation (DT)")) %>% 
    filter(doy<=160 & doy>=110) %>% 
  spread(key="year", value="doy") %>% 
  # drop_na() %>% 
  rowwise() %>% 
  mutate(lat=lat+rnorm(1, 0, 0.002),
         lon=lon+rnorm(1, 0, 0.002)) %>% 
  ungroup() %>% 
  gather(key="group", value="doy", -id, -lat, -lon)  %>% 
  group_by(group) %>% 
  mutate(doy=(doy-mean(doy, na.rm = T))/(quantile(doy, 0.95, na.rm = T)-quantile(doy, 0.05, na.rm = T))) %>% # standardize to between -0.5 and 0.5
  filter(doy>=-0.5 & doy<=0.5) %>% # remove outliers
  ungroup() %>% 
  ggplot()+
  geom_point(aes(x=lon, y=lat, col=doy))+
  facet_wrap(.~group, ncol=2)+
  coord_equal()+
  theme_classic()+
  scale_color_viridis_c(direction = -1)

apply tp hls in dt

3 Match pollen phenology with leafing phenology

Conceptual figure.

conceptual1<-image_read("/raid/users/ysong67/GitHub/phenology/RS4flower/output/figures/conceptual1.jpg")
p_conceptual1<-image_ggplot(conceptual1)
conceptual2<-image_read("/raid/users/ysong67/GitHub/phenology/RS4flower/output/figures/conceptual1.jpg")
p_conceptual2<-image_ggplot(conceptual2)
grid.arrange(p_conceptual1, p_conceptual2, ncol=1)

3.1 Characterize leafing phenology for all taxa and cities

cl <- makeCluster(50, outfile = "")
registerDoSNOW(cl)

for (taxaoi in taxa_list) {
  taxaoi_short<-str_split(taxaoi, " ", simplify = T)[1]
  flower_window<-seq(flower_window_df %>% filter(taxa==taxaoi) %>% pull(start),
                     flower_window_df %>% filter(taxa==taxaoi) %>% pull(end),
                     by=1)
  if (taxaoi %in% c("Ambrosia", "Ulmus late")) {
    thres_df_taxa<-thres_df %>% filter(direction=="down")
  } else if (taxaoi== "Poaceae early"  ) {
    thres_df_taxa<-thres_df %>% filter(threshold>=0.5|direction=="up")
  } else if (taxaoi== "Poaceae late"  ) {
    thres_df_taxa<-thres_df %>% filter(threshold>=0.5|direction=="down")
  } else {
    thres_df_taxa<-thres_df %>% filter(direction=="up")
  }
  flower_doy_df_siteyears_list<-flower_freq_df_siteyears_list<-vector(mode="list", length=length(site_list))
  for (s in 1:length(site_list)) {
    siteoi<-site_list[s]
    plant_taxa_df<-plant_df %>% 
      filter(site==siteoi) %>% 
      filter(genus==taxaoi_short|family==taxaoi_short) %>% 
      mutate(id=row_number()) %>% 
      drop_na(lon, lat)
    
    plant_df %>%
      filter(site==siteoi) %>%
      filter(genus==taxaoi_short|family==taxaoi_short) %>%
      group_by(species) %>%
      summarise(n=n()) %>%
      ungroup() %>%
      arrange(desc(n))
    
    if (taxaoi %in% c("Ambrosia","Poaceae")) {
      min_sample_size=10
    } else {
      min_sample_size=20
    }
    
    if (nrow(plant_taxa_df)>=min_sample_size) {
      set.seed(1)
      random_id<<-sample(plant_taxa_df$id, min(2000, length(unique(plant_taxa_df$id)))) %>% sort()
      
      # preprocess ps data
      ps_df<-read_rds(paste0(ps_path, "analyses/ps_",siteoi,"_",taxaoi_short,".rds")) 
      ps_df_proc<- ps_df%>% 
        drop_na() %>% 
        filter(id%in% random_id) %>%
        mutate(date=as.Date(time)) %>% 
        mutate(year=format(time, "%Y") %>% as.integer(),
               doy=format(time, "%j") %>% as.integer(),
               hour=format(strptime(time,"%Y-%m-%d %H:%M:%S"),'%H') %>% as.integer()) %>% 
        filter(qa==0) %>%
        group_by(id, lon, lat, date, year, doy ) %>% 
        summarise(blue=mean(blue),
                  green=mean(green),
                  red=mean(red),
                  nir=mean(nir)) %>% 
        ungroup() %>% 
        mutate(evi=2.5* (nir-red) / (nir + 6*red - 7.5*blue + 1)) %>% 
        filter(evi>0, evi<=1) %>% 
        filter(red>0, green>0, blue>0) 
      
      # subset nab data
      pollen_df<- nab_with_taxa_df %>% 
        left_join(meta_df %>% dplyr::select(id, site), by="id") %>% 
        filter(site==siteoi) %>% 
        filter(genus==taxaoi_short|family==taxaoi_short) 
      
      # subset npn data
      npn_df<-npn_df_all %>% 
        filter(site==siteoi) %>% 
        filter(taxa==taxaoi_short)
      
      ts_df<-ps_df_proc %>% 
        left_join(plant_taxa_df, by=c("id", "lon", "lat")) %>% 
        
        dplyr::select(id, date, `EVI (PS)`=evi #, 
                      # `G2R (PS)`=g2r, `EBI (PS)`=ebi
                      ) %>% 
        mutate(id=as.factor(id)) %>% 
        full_join(pollen_df %>% 
                    dplyr::select(date, `pollen count (NAB)`=count) %>% 
                    mutate(id="pollen") , 
                  by=c("date", "id") ) %>% 
        full_join(npn_df %>%
                    dplyr::select(date, `flower observation (USA-NPN)`=count) %>%
                    mutate(id="npn") ,
                  by=c("date", "id") ) %>%
        arrange(id, date) %>% 
        mutate(doy=format(date, "%j") %>% as.numeric()) %>% 
        mutate(year=format(date, "%Y") %>% as.numeric()) 
      
      flower_doy_df_years_list<-flower_freq_df_years_list<-vector(mode="list", length=length(year_list))
      for (y in 1:length(year_list)) {
        yearoi<-year_list[y]
        ts_df_subset<-ts_df %>% 
          filter(doy!=366) %>% 
          filter(year==yearoi|year==(yearoi-1)|year==(yearoi+1)) %>% 
          mutate(doy=ifelse(doy>273& year==yearoi-1, doy-365, doy)) %>% 
          mutate(year=ifelse(doy<=0 & year==yearoi-1, year+1, year)) %>% 
          mutate(doy=ifelse(doy<152& year==yearoi+1, doy+365, doy)) %>% 
          mutate(year=ifelse(doy>365 & year==yearoi+1, year-1, year)) %>% 
          filter(year==yearoi) %>% 
          gather(key="var", value="value", -date, -id, -doy, -year ) %>% 
          mutate(var=fct_relevel(var, levels=c( "EVI (PS)", "G2R (PS)","EBI (PS)", "pollen count (NAB)", "flower observation (USA-NPN)"))) %>% 
          mutate(alpha=case_when(var %in% c("EVI", "G2R")~0.05,
                                 TRUE~1)) %>% 
          arrange(doy)
        
        ts_df_subset_summary<- ts_df_subset%>%
          drop_na(value) %>%
          group_by(date, var, doy, year) %>%
          summarise(q1=quantile(value, 0.05, na.rm=T),
                    q2=quantile(value, 0.5, na.rm=T),
                    q3=quantile(value, 0.95, na.rm=T),
                    n=n()) %>%
          filter(n>1) %>%
          ungroup()
        
        flower_df_list<-
          foreach (i = 1:length(random_id),
                   .packages = c("tidyverse", "ptw","greenbrown", "EnvCpt","segmented", "RhpcBLASctl"))  %dopar% {
                     blas_set_num_threads(1)
                     omp_set_num_threads(1)
                     
                     debug=F
                     idoi <-  as.character(random_id)[i]
                     
                     if (debug) { 
                       set.seed(NULL)
                       idoi<-sample(random_id,1) %>% as.character()
                     }
                     
                     flower_df<-get_doy(plant_taxa_df,thres_df_taxa, ts_df_subset, idoi)
                     
                     if (debug) {
                       p_1tree<-plot_tree(plant_taxa_df, ts_df_subset, flower_df, idoi)
                     }
                     
                     print(paste0(i , " out of ", length(random_id)))
                     flower_df
                   }
        flower_doy_df<-bind_rows(flower_df_list) %>% 
          left_join(plant_taxa_df %>% dplyr::select(id, species, lat, lon) %>% mutate(id=as.character(id)), by="id") %>% 
          mutate(site=siteoi, year=yearoi)
        
        flower_freq_df_list<-vector(mode="list", length=nrow(thres_df_taxa))
        for (t in 1:nrow(thres_df_taxa)) {
          flower_freq_df_list[[t]]<-flower_doy_df %>% 
            drop_na(start, end) %>% 
            filter(direction==thres_df_taxa$direction[t],
                   thres==thres_df_taxa$threshold[t]) %>% 
            group_by(doy, thres,direction) %>%
            summarise(count=n()) %>%
            ungroup() %>% 
            mutate(freq=count/n())
        }
        flower_freq_df<-bind_rows(flower_freq_df_list)
        
        
        flower_freq_df<-flower_freq_df %>% 
          mutate(doy=factor(doy, levels=c((274-365):(365+151))) ) %>% 
          complete(doy, thres,direction,fill=list(count=0,freq=0)) %>%
          mutate(doy=doy %>% as.character() %>% as.numeric()) %>% 
          arrange(doy) %>% 
          full_join(ts_df_subset %>% 
                      filter(id=="pollen") %>% 
                      filter(var=="pollen count (NAB)") %>% 
                      filter(year==yearoi) %>% 
                      dplyr::select( doy, pollen=value),
                    by="doy") %>% 
          full_join(ts_df_subset %>%
                      filter(id=="npn") %>%
                      filter(var=="flower observation (USA-NPN)") %>%
                      filter(year==yearoi) %>%
                      dplyr::select( doy, npn=value),
                    by="doy") %>%
          full_join(ts_df_subset_summary %>% 
                      filter(var=="EVI (PS)") %>% 
                      filter(year==yearoi) %>% 
                      dplyr::select( doy, evi=q2),
                    by="doy"
          ) %>% 
          full_join(ts_df_subset_summary %>% 
                      filter(var=="G2R (PS)") %>% 
                      filter(year==yearoi) %>% 
                      dplyr::select( doy, g2r=q2),
                    by="doy"
          ) %>% 
          mutate(doy=as.numeric(doy)) %>% 
          mutate(site=siteoi, year=yearoi) %>% 
          drop_na(doy, thres)
        
        print(paste0(siteoi, ", ", yearoi))
        
        flower_doy_df_years_list[[y]]<-flower_doy_df 
        flower_freq_df_years_list[[y]]<-flower_freq_df
      }
      flower_doy_df_siteyears_list[[s]]<-bind_rows(flower_doy_df_years_list)
      flower_freq_df_siteyears_list[[s]]<-bind_rows(flower_freq_df_years_list)
    }
  }
  flower_doy_df<-bind_rows(flower_doy_df_siteyears_list) %>% 
    left_join(data.frame(site=site_list, sitename=sitename_list
                         # , region=region_list
                         ), by=c("site"))
  
  flower_freq_df<-bind_rows(flower_freq_df_siteyears_list) %>% 
    group_by(site, year,thres,direction) %>% 
    mutate(freq_sm=whit1(freq, 30)) %>% 
    mutate(pollen_in=na.approx(pollen, doy, na.rm=F, maxgap=14)) %>%  
    mutate(pollen_fill=replace_na(pollen_in, 0)) %>%
    mutate(pollen_sm=whitfun(pollen_fill,30)) %>%  
    ungroup() %>% 
    dplyr::select(-pollen_in, -pollen_fill) %>% 
    mutate(pollen=case_when(doy %in% flower_window~ pollen)) %>% 
    mutate(pollen_sm=case_when(doy %in% flower_window~ pollen_sm,
                               TRUE~0)) %>% 
    mutate(npn=case_when(doy %in% flower_window~ npn))
  
  # pollen climatology (historical mean of smoothed pollen count)
  pollen_clim<-flower_freq_df %>% 
    group_by(site,direction, thres,doy) %>%
    summarise(pollen_clim=mean(pollen_sm, na.rm=T)) 
  
  flower_freq_df<-flower_freq_df%>% 
    left_join(pollen_clim, by=c("site",  "direction","thres", "doy")) %>% 
    left_join(data.frame(site=site_list, sitename=sitename_list
                         # , region=region_list
                         ), by=c("site"))
  
  
  output_path<-paste0("/raid/users/ysong67/GitHub/phenology/RS4flower/output/data/", taxaoi,"/")
  dir.create(output_path, recursive = T)
  write_rds(flower_doy_df,paste0(output_path,"flowering day of year.rds" ))
  write_rds(flower_freq_df,paste0(output_path,"flowering frequency.rds" ) )
}
stopCluster(cl)

3.2 Tune threshold and time lag with pollen data

cl <- makeCluster(50, outfile = "")
registerDoSNOW(cl)
for (taxaoi in taxa_list) {
  taxaoi_short<-str_split(taxaoi, " ", simplify = T)[1]
  flower_window<-seq(flower_window_df %>% filter(taxa==taxaoi) %>% pull(start),
                     flower_window_df %>% filter(taxa==taxaoi) %>% pull(end),
                     by=1)
  if (taxaoi %in% c("Ambrosia", "Ulmus late")) {
    thres_df_taxa<-thres_df %>% filter(direction=="down")
  } else if (taxaoi== "Poaceae early"  ) {
    thres_df_taxa<-thres_df %>% filter(threshold>=0.5|direction=="up")
  } else if (taxaoi== "Poaceae late"  ) {
    thres_df_taxa<-thres_df %>% filter(threshold>=0.5|direction=="down")
  } else {
    thres_df_taxa<-thres_df %>% filter(direction=="up")
  }
  
  output_path<-paste0("/raid/users/ysong67/GitHub/phenology/RS4flower/output/data/", taxaoi,"/")
  
  # read in leafing phenology data
  flower_doy_df<-read_rds(paste0(output_path,"flowering day of year.rds" ))
  flower_freq_df<-read_rds(paste0(output_path,"flowering frequency.rds" ) )
  
  # standardize all data to be between 0 and 1
  flower_freq_df_standard<-flower_freq_df %>% 
    mutate(pollen=pollen %>% sqrt()) %>%
    mutate(pollen_sm=pollen_sm %>% sqrt()) %>%
    mutate(pollen_clim=pollen_clim %>% sqrt()) %>%
    group_by(site, sitename, thres) %>%
    mutate(freq=(freq-min(freq, na.rm = T))/(max(freq, na.rm = T)-min(freq, na.rm = T))) %>% 
    mutate(freq_sm=(freq_sm-min(freq_sm, na.rm = T))/(max(freq_sm, na.rm = T)-min(freq_sm, na.rm = T))) %>% 
    mutate(pollen=(pollen-min(pollen, na.rm = T))/(max(pollen, na.rm = T)-min(pollen, na.rm = T))) %>%
    mutate(pollen_clim=(pollen_clim-min(pollen_sm, na.rm = T))/(max(pollen_sm, na.rm = T)-min(pollen_sm, na.rm = T))) %>% 
    mutate(pollen_sm=(pollen_sm-min(pollen_sm, na.rm = T))/(max(pollen_sm, na.rm = T)-min(pollen_sm, na.rm = T))) %>% 
    mutate(npn=(npn-min(npn, na.rm = T))/(max(npn, na.rm = T)-min(npn, na.rm = T))) %>%
    mutate(evi=(evi-min(evi, na.rm = T))/(max(evi, na.rm = T)-min(evi, na.rm = T))) %>% 
    mutate(g2r=(g2r-min(g2r, na.rm = T))/(max(g2r, na.rm = T)-min(g2r, na.rm = T))) %>% 
    ungroup()
  
  # optimize threshold and lag
  flower_freq_df_check_list<-vector(mode="list", length=length(site_list))
  for (r in 1:length(site_list)) { # select a region, here each site is treated as an individual region
    regionoi<-site_list[r]
    flower_freq_df_check_region_list<-vector(mode="list", length=nrow(thres_df_taxa))
    
    for (t in 1:nrow(thres_df_taxa)) { # select a threshold
      flower_freq_df_standard_subset<-flower_freq_df_standard %>% 
        filter(site==regionoi) %>%
        filter(direction==thres_df_taxa$direction[t],
               thres==thres_df_taxa$threshold[t]) %>% 
        group_by(site, year) %>% 
        ungroup()
      sample_size<-flower_freq_df_standard_subset %>% filter(!is.na(pollen), pollen>0) %>% nrow()
      if (sample_size>=5*4) { # only do tuning when there are more than 20 none-zero pollen count. skip the taxa and site otherwise.
        pollen_sm_ts<- flower_freq_df_standard_subset %>% pull (pollen_sm) # smoothed pollen count, for tuning
        pollen_ts<- flower_freq_df_standard_subset %>% pull (pollen) # pollen count, for calculating accuracy
        
        pollen_clim_ts<- flower_freq_df_standard_subset %>% pull (pollen_clim) # climatology
        rmse_clim<-sqrt(mean((pollen_clim_ts-pollen_ts)^2, na.rm=T)) # rmse between climatology and pollen count
        
        lags_list<- -200:200 # possible lags to try
        flower_freq_df_check_thres_list<-
          foreach (l = 1:length(lags_list),
                   .packages = c("tidyverse")) %dopar% {
                     lag=lags_list[l]
                     if (lag<0) {
                       flower_freq_df_standard_subset_lag<- flower_freq_df_standard_subset %>% mutate(freq_sm=lead(freq_sm,n=-lag))  %>% mutate(freq_sm=replace_na(freq_sm, 0)) # shift leafing phenology earlier by "lag" days, fill the NA with 0
                     } else if (lag==0) {
                       flower_freq_df_standard_subset_lag<-flower_freq_df_standard_subset  %>% mutate(freq_sm=replace_na(freq_sm, 0))
                     } else if (lag>0) {
                       flower_freq_df_standard_subset_lag<-flower_freq_df_standard_subset %>% mutate(freq_sm=lag(freq_sm,n=lag)) %>% mutate(freq_sm=replace_na(freq_sm, 0)) # shift leafing phenology later by "lag" days, fill the NA with 0
                     }
                     freq_ts_lag<- flower_freq_df_standard_subset_lag %>% pull (freq_sm)
                     rmse<-sqrt(mean((freq_ts_lag-pollen_sm_ts)^2, na.rm=T)) # rmse between remotely-sensed flowering phenology and smoothed pollen count
                     rmse_ps<-sqrt(mean((freq_ts_lag-pollen_ts)^2, na.rm=T)) # rmse between remotely-sensed flowering phenology and pollen count
                     
                     print(paste0("region ", r, ", threshold ", t, ", lag ", l))
                     
                     data.frame(direction=thres_df_taxa$direction[t],thres=thres_df_taxa$threshold[t],lag=lag, rmse=rmse,rmse_ps=rmse_ps, rmse_clim=rmse_clim)
                   }
        flower_freq_df_check_region_list[[t]]<-bind_rows(flower_freq_df_check_thres_list) %>% 
          arrange(rmse) %>% # choose the lag giving the smallest rmse in the threshold
          head(1)
      } else {
        flower_freq_df_check_region_list[[t]]<-data.frame(thres=numeric(0),lag=numeric(0), rmse=numeric(0), rmse_ps=numeric(0),rmse_clim=numeric(0))
      }
    }
    flower_freq_df_check_list[[r]]<-bind_rows(flower_freq_df_check_region_list) %>% 
      mutate(region=regionoi)  %>%
      arrange((rmse))
  }
  flower_freq_df_check<-bind_rows(flower_freq_df_check_list)
  # flower_freq_df_check
  write_rds(flower_freq_df_check,paste0(output_path,"accuracy check.rds") )
  
  best_thres<-flower_freq_df_check %>% 
    group_by(direction, thres) %>% 
    summarise(rmse=median(rmse)) %>% # median rmse for each threshold
    arrange(rmse) %>% 
    head(1) %>% # keep threshold giving the smallest median rmse
    dplyr::select(direction,thres)
  
  # flower_freq_df_check %>% filter(direction==best_thres$direction[1],
  #                                 thres==best_thres$thres[1])
  
  # getting time series with best threshold and lag
  flower_freq_df_standard_best_list<-vector(mode="list", length=length(site_list))
  for (r in 1:length(site_list)) {
    regionoi<-site_list[r]
    lagoi<-flower_freq_df_check %>% filter(direction==best_thres$direction,thres==best_thres$thres, region==regionoi) %>% pull(lag)
    if (length(lagoi)>0) {
      flower_freq_df_standard_best<-flower_freq_df_standard %>% 
        filter(site==regionoi) %>% 
        filter(direction==best_thres$direction,
               thres == best_thres$thres) %>%
        ungroup() %>%
        group_by(site, year)
      
      if (lagoi<0) {
        flower_freq_df_standard_best<- flower_freq_df_standard_best %>% mutate(freq_sm=lead(freq_sm,n=-lagoi)) %>% mutate(freq_sm=replace_na(freq_sm, 0))
      } else if (lagoi==0) {
        flower_freq_df_standard_best<- flower_freq_df_standard_best %>% mutate(freq_sm=replace_na(freq_sm, 0))
      } else if (lagoi>0) {
        flower_freq_df_standard_best<- flower_freq_df_standard_best %>% mutate(freq_sm=lag(freq_sm,n=lagoi)) %>% mutate(freq_sm=replace_na(freq_sm, 0))
      }  
      
      flower_freq_df_standard_best_list[[r]]<-flower_freq_df_standard_best %>% mutate(lag=lagoi)
    }
  }
  flower_freq_df_standard_best<-bind_rows(flower_freq_df_standard_best_list)
  
  # make plots
  flower_freq_comp<-ggplot(flower_freq_df_standard_best)+
    geom_point(aes(x=doy, y=npn, col="flower observation (USA-NPN)"), alpha=0.5)+
    geom_point(aes(x=doy, y=pollen, col="pollen count (NAB)"))+
    geom_line(aes(x=doy, y=pollen_clim, col="pollen count (NAB)"),alpha=0.5, lwd=1)+
    geom_point(aes(x=doy, y=evi, col="EVI (PS)"), alpha=0.2)+
    geom_line(aes(x=doy, y=freq_sm, col="flowering frequency"), lwd=1)+
    theme_classic()+
    facet_wrap(.~paste0(sitename, " (Lag: ", lag, ")")*year, ncol=4)+
    scale_color_manual(values=cols)+
    theme(legend.position="bottom")+
    theme(legend.title = element_blank())+
    ylab("")+
    ggtitle(paste0("Taxa: ",taxaoi," (Threshold: ",best_thres$direction, " ",best_thres$thres, ")"))
  flower_freq_comp
  jpeg(paste0(output_path, "flower frequency compared with other data.jpg"),
            height = 1600, width = 1600, res=150)
  print(flower_freq_comp)
  dev.off()
  
  flower_freq_corr<-ggplot(flower_freq_df_standard_best %>% mutate(year=as.factor(year)))+
    geom_point(aes(x=freq_sm, y=pollen , group=year, col=year))+
    geom_smooth(aes(x=freq_sm, y=pollen , group=year, col=year), method="lm", se=F, lwd=0.5)+
    geom_smooth(aes(x=freq_sm, y=pollen), method="lm")+
    theme_classic()+
    facet_wrap(.~sitename, ncol=4)+
    coord_equal()+
    xlim(0, 1)+
    ylim(0, 1)+
    ylab("pollen count^(1/2)")+
    xlab("flowering frequency")
  flower_freq_corr
  jpeg(paste0(output_path, "flower frequency and pollen count correlation.jpg"),
            height = 1200, width = 1600, res=150)
  print(flower_freq_corr)
  dev.off()
  
  across_site_and_year<-ggplot(flower_freq_df_standard_best %>%  
                                 mutate(year=as.factor(year)))+
    geom_point(aes(x=doy, y=pollen , group=year, col=year))+
    geom_line(aes(x=doy, y=freq_sm, group=year, col=year))+
    facet_wrap(.~paste0(sitename, " (Lag: ", lag, ")"), ncol=1)+
    theme_classic()+
    ylab("")+
    ggtitle(paste0("Taxa: ",taxaoi," (Threshold: ",best_thres$direction, " ",best_thres$thres, ")"))
  across_site_and_year
  
  jpeg(paste0(output_path, "phenology across site and year.jpg"),
            height = 1600, width = 1600, res=150)
  print(across_site_and_year)
  dev.off()
}
stopCluster(cl)

Display results in a Shiny app.

shinyApp(

  ui = fluidPage(
    selectInput("taxa", "Taxa:", choices = taxa_list),
    selectInput("plot", "Plot:", choices = c("flower frequency compared with other data",
                                             "phenology across site and year",
                                             "flower frequency and pollen count correlation")),
    plotOutput("plot")
  ),

  server = function(input, output) {
    img_file<-reactive(paste0("/raid/users/ysong67/GitHub/phenology/RS4flower/output/data/",input$taxa,"/",input$plot, ".jpg"))
    output$plot = renderPlot({
      image_ggplot(image_read(img_file()))
    })
  },

  options = list(height = 500)
)

Shiny applications not supported in static R Markdown documents

Compare accuracy with climatology.

flower_freq_df_check_list<-vector(mode="list") 
for (taxaoi in taxa_list) {
  output_path<-paste0("/raid/users/ysong67/GitHub/phenology/RS4flower/output/data/", taxaoi,"/")
  flower_freq_df_check_list[[taxaoi]]<-read_rds(paste0(output_path,"accuracy check.rds")) %>% 
    mutate(taxa=taxaoi)
}
flower_freq_df_check<-bind_rows(flower_freq_df_check_list)
best_thres<-flower_freq_df_check %>% group_by(taxa, thres) %>% summarise(rmse=median(rmse)) %>% arrange(rmse) %>% slice(1) %>% dplyr::select(taxa, thres)

accuracy_df<-flower_freq_df_check %>% 
  right_join(best_thres, by=c("taxa", "thres")) %>% 
  left_join(meta_df %>% dplyr::select(site, sitename), by=c("region"="site"))

accuracy_df %>% 
  dplyr::select(taxa, site=region, sitename, direction, thres, lag)
##             taxa site    sitename direction thres lag
## 1        Quercus   NY    New York        up   0.8 -18
## 2        Quercus   SJ    San Jose        up   0.8 -11
## 3        Quercus   AT      Austin        up   0.8 -18
## 4        Quercus   ST     Seattle        up   0.8 -14
## 5        Quercus   HT     Houston        up   0.8 -25
## 6        Quercus   TP       Tampa        up   0.8 -43
## 7        Quercus   DT     Detroit        up   0.8 -11
## 8        Quercus   DV      Denver        up   0.8  -3
## 9        Quercus   KC Kansas City        up   0.8 -18
## 10       Quercus   SL   St. Louis        up   0.8 -19
## 11  Cupressaceae   NY    New York        up   0.3 -22
## 12  Cupressaceae   SJ    San Jose        up   0.3  68
## 13  Cupressaceae   AT      Austin        up   0.3 -79
## 14  Cupressaceae   ST     Seattle        up   0.3  24
## 15  Cupressaceae   HT     Houston        up   0.3 -46
## 16  Cupressaceae   TP       Tampa        up   0.3 -35
## 17  Cupressaceae   DV      Denver        up   0.3   6
## 18  Cupressaceae   KC Kansas City        up   0.3 -20
## 19  Cupressaceae   SL   St. Louis        up   0.3 -16
## 20      Ambrosia   AT      Austin      down   1.0 134
## 21      Ambrosia   HT     Houston      down   1.0 126
## 22      Ambrosia   TP       Tampa      down   1.0  37
## 23      Ambrosia   DT     Detroit      down   1.0  38
## 24      Ambrosia   DV      Denver      down   1.0  71
## 25      Ambrosia   KC Kansas City      down   1.0  68
## 26      Ambrosia   SL   St. Louis      down   1.0  44
## 27         Morus   NY    New York        up   0.8 -16
## 28         Morus   SJ    San Jose        up   0.8 -23
## 29         Morus   AT      Austin        up   0.8 -21
## 30         Morus   HT     Houston        up   0.8 -29
## 31         Morus   TP       Tampa        up   0.8 -48
## 32         Morus   DV      Denver        up   0.8  -9
## 33         Morus   KC Kansas City        up   0.8 -11
## 34         Morus   SL   St. Louis        up   0.8 -21
## 35      Pinaceae   NY    New York        up   0.6  48
## 36      Pinaceae   SJ    San Jose        up   0.6  83
## 37      Pinaceae   ST     Seattle        up   0.6  19
## 38      Pinaceae   HT     Houston        up   0.6 -20
## 39      Pinaceae   TP       Tampa        up   0.6 -41
## 40      Pinaceae   DV      Denver        up   0.6  33
## 41      Pinaceae   SL   St. Louis        up   0.6  15
## 42   Ulmus early   NY    New York        up   0.8 -51
## 43   Ulmus early   SJ    San Jose        up   0.8 -15
## 44   Ulmus early   AT      Austin        up   0.8 -65
## 45   Ulmus early   ST     Seattle        up   0.8 -56
## 46   Ulmus early   HT     Houston        up   0.8 -67
## 47   Ulmus early   DV      Denver        up   0.8 -69
## 48   Ulmus early   KC Kansas City        up   0.8 -61
## 49   Ulmus early   SL   St. Louis        up   0.8 -58
## 50    Ulmus late   SJ    San Jose      down   0.6  13
## 51    Ulmus late   AT      Austin      down   0.6  44
## 52    Ulmus late   HT     Houston      down   0.6  80
## 53    Ulmus late   KC Kansas City      down   0.6 -30
## 54    Ulmus late   SL   St. Louis      down   0.6 -19
## 55      Fraxinus   SJ    San Jose        up   0.6 -20
## 56      Fraxinus   AT      Austin        up   0.6 -40
## 57      Fraxinus   ST     Seattle        up   0.6  -6
## 58      Fraxinus   HT     Houston        up   0.6 -12
## 59      Fraxinus   DV      Denver        up   0.6 -15
## 60      Fraxinus   KC Kansas City        up   0.6  -6
## 61      Fraxinus   SL   St. Louis        up   0.6  -6
## 62        Betula   NY    New York        up   0.8 -20
## 63        Betula   SJ    San Jose        up   0.8 -11
## 64        Betula   ST     Seattle        up   0.8 -32
## 65        Betula   HT     Houston        up   0.8 -43
## 66        Betula   DV      Denver        up   0.8 -12
## 67 Poaceae early   NY    New York        up   0.3  41
## 68 Poaceae early   SJ    San Jose        up   0.3 150
## 69 Poaceae early   AT      Austin        up   0.3  56
## 70 Poaceae early   ST     Seattle        up   0.3 126
## 71 Poaceae early   HT     Houston        up   0.3  50
## 72 Poaceae early   TP       Tampa        up   0.3  72
## 73 Poaceae early   DT     Detroit        up   0.3  81
## 74 Poaceae early   DV      Denver        up   0.3  75
## 75 Poaceae early   KC Kansas City        up   0.3  53
## 76 Poaceae early   SL   St. Louis        up   0.3  47
## 77  Poaceae late   NY    New York      down   0.3 -54
## 78  Poaceae late   SJ    San Jose      down   0.3  85
## 79  Poaceae late   AT      Austin      down   0.3 -54
## 80  Poaceae late   HT     Houston      down   0.3 -41
## 81  Poaceae late   TP       Tampa      down   0.3 -60
## 82  Poaceae late   DT     Detroit      down   0.3 -73
## 83  Poaceae late   DV      Denver      down   0.3 -61
## 84  Poaceae late   KC Kansas City      down   0.3 -84
## 85  Poaceae late   SL   St. Louis      down   0.3 -60
## 86          Acer   ST     Seattle        up   0.9 -30
## 87          Acer   HT     Houston        up   0.9 -45
## 88          Acer   DV      Denver        up   0.9 -68
## 89          Acer   KC Kansas City        up   0.9 -70
## 90          Acer   SL   St. Louis        up   0.9 -77
## 91       Populus   SJ    San Jose        up   0.7  -4
## 92       Populus   ST     Seattle        up   0.7 -36
## 93       Populus   HT     Houston        up   0.7 -18
## 94       Populus   DV      Denver        up   0.7 -16
## 95       Populus   KC Kansas City        up   0.7 -22
ggplot(accuracy_df)+
  geom_point(aes(x=region, y=rmse_ps), col="dark blue")+
  geom_point(aes(x=region, y=rmse_clim), col="dark red")+
  facet_wrap(.~taxa)+
  ylab("RMSE")+
  theme_classic()

accuracy_df %>% 
  mutate(rmse_diff=rmse_clim-rmse_ps) %>% 
  drop_na() %>% 
  group_by(taxa) %>% 
  summarize (min=min(rmse_diff),
             max=max(rmse_diff),
             mean=mean(rmse_diff),
             median=median(rmse_diff),
             proportion=sum(rmse_diff>0)/n()) %>% 
  ungroup()
## # A tibble: 13 x 6
##    taxa              min     max      mean    median proportion
##  * <chr>           <dbl>   <dbl>     <dbl>     <dbl>      <dbl>
##  1 Acer          -0.0423  0.0526  0.0215    0.0290        0.8  
##  2 Ambrosia      -0.0471  0.0192 -0.0237   -0.0302        0.143
##  3 Betula        -0.0612  0.105   0.0157    0.00645       0.6  
##  4 Cupressaceae  -0.0338  0.0789  0.0111    0.00363       0.778
##  5 Fraxinus      -0.0415  0.0872  0.0225    0.0249        0.714
##  6 Morus         -0.0273  0.0318  0.000910  0.00344       0.5  
##  7 Pinaceae      -0.0710  0.0542  0.00610   0.0133        0.714
##  8 Poaceae early -0.0724  0.0645  0.0134    0.0186        0.6  
##  9 Poaceae late  -0.186   0.0507 -0.0357   -0.0274        0.333
## 10 Populus       -0.107   0.0564  0.00583   0.0243        0.8  
## 11 Quercus       -0.0935  0.0639 -0.00322  -0.0108        0.4  
## 12 Ulmus early   -0.0500  0.0709 -0.00378  -0.000808      0.5  
## 13 Ulmus late    -0.0954 -0.0214 -0.0568   -0.0704        0

4 Infer relationship between leaf-flower lag with climate

Plot correlation between mean annual temperature (MAT) and lag between green-up/down frequency and pollen count. * A positive lag means leafing phenology leads pollen phenology; a negative lag means leafing phenology lags pollen phenology.

# taxa in chronological order
taxa_chron<-c("Cupressaceae" , "Fraxinus","Ulmus early"  ,"Pinaceae" ,"Acer", "Populus","Quercus",   "Betula","Morus",  "Poaceae early","Poaceae late"  ,    "Ulmus late" ,  "Ambrosia" )

# data frame with flowering frequency and climate info, grouped into early and late taxa
lag_clim_df<-accuracy_df %>% 
  dplyr::select(-rmse, -rmse_ps, -rmse_clim) %>% 
  left_join(chelsa_df, by=c("region"="site")) %>% 
  mutate(taxa=as.factor(taxa)) %>% 
  mutate(taxa=fct_relevel(taxa, levels=taxa_chron))%>%
  mutate(group=case_when(taxa %in% c("Ulmus late", "Poaceae late", "Ambrosia")~"late",
                                                                              TRUE~"early"))
## Warning: Outer names are only allowed for unnamed scalar atomic inputs
# plot lag vs. climate
ggplot(lag_clim_df)+
  geom_point(aes(x=mat, y=lag, col=region, group=region))+
  geom_smooth(aes(x=mat, y=lag), method="lm", se=F)+
  geom_smooth(aes(x=mat, y=lag), method="lm", alpha=0.5)+
  theme_classic()+
  facet_wrap(.~taxa, scales = "free_y", ncol=5)+
  xlab("Mean annual temperature (°C)")+
  ylab ("Lag between leafing and pollen phenology (day)")

Linear regression to check significance of the correlation. When looking at individual taxa, only 3 were statistically significant.

# linear regression
reg.df<-lag_clim_df  %>% 
  # filter(region!="SJ") %>% 
  group_by(taxa) %>%
  do(broom::tidy(lm(lag ~ mat, .))) %>%
  filter(term %in% c("mat")) %>%
  dplyr::select( -statistic)
ggplot(reg.df %>% mutate(taxa=fct_relevel(taxa,levels=rev(taxa_chron))))+
  geom_point(aes(x=taxa, y=estimate))+
  geom_errorbar(aes(x=taxa, ymin=estimate-1.95*std.error, ymax=estimate+1.95*std.error))+
  geom_hline(yintercept = 0, lty=2)+
  geom_vline(xintercept = 3.5, lty=1)+
  theme_classic()+
  coord_flip()+
  ylab("Slope (day/°C)")+
  xlab ("Taxa")
## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

## Warning: Outer names are only allowed for unnamed scalar atomic inputs

summary(reg.df$p.value<0.05)
##    Mode   FALSE    TRUE 
## logical      10       3

Linear mixed-effect model with random slope and random intercept to summarize correlation over early and late flowering taxa. * For early-flowering taxa, pollen phenology usually leads leafing phenology. At warmer places, pollen leads leafing even more. This effect was statistically significant. * For early-flowering taxa, pollen phenology usually lags leafing phenology. At warmer places, pollen lags green-down even more. This effect was not statistically significant, possibly because there were only three taxa. It was significant for late-flowering elm trees.

lme.fit<-lme(lag ~ mat, random = ~ 1+mat|taxa, data=lag_clim_df %>% filter(group=="early"),control =lmeControl(opt='optim',optimMethod = "SANN") )
summary(lme.fit)
## Linear mixed-effects model fit by REML
##   Data: lag_clim_df %>% filter(group == "early") 
##        AIC      BIC    logLik
##   713.3482 727.0082 -350.6741
## 
## Random effects:
##  Formula: ~1 + mat | taxa
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev     Corr  
## (Intercept) 47.6913779 (Intr)
## mat          0.9906993 -0.789
## Residual    24.0505312       
## 
## Fixed effects:  lag ~ mat 
##                 Value Std.Error DF    t-value p-value
## (Intercept) 15.681029 17.830123 63  0.8794684  0.3825
## mat         -2.000099  0.714202 63 -2.8004690  0.0068
##  Correlation: 
##     (Intr)
## mat -0.749
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -2.2502434 -0.4759990 -0.1362737  0.2622050  3.3179205 
## 
## Number of Observations: 74
## Number of Groups: 10
lme.fit<-lme(lag ~ mat, random = ~ 1+mat|taxa, data=lag_clim_df %>% filter(group=="late"),control =lmeControl(opt='optim',optimMethod = "SANN"))
summary(lme.fit)
## Linear mixed-effects model fit by REML
##   Data: lag_clim_df %>% filter(group == "late") 
##        AIC      BIC    logLik
##   224.0688 229.7355 -106.0344
## 
## Random effects:
##  Formula: ~1 + mat | taxa
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev   Corr  
## (Intercept) 45.91756 (Intr)
## mat          1.64489 0.228 
## Residual    44.01958       
## 
## Fixed effects:  lag ~ mat 
##                 Value Std.Error DF    t-value p-value
## (Intercept) -37.89290  43.67995 17 -0.8675126  0.3977
## mat           3.44892   2.36474 17  1.4584807  0.1629
##  Correlation: 
##     (Intr)
## mat -0.642
## 
## Standardized Within-Group Residuals:
##        Min         Q1        Med         Q3        Max 
## -1.3832425 -0.5402016 -0.2215449  0.2550384  2.8860598 
## 
## Number of Observations: 21
## Number of Groups: 3